From dd5d226f6a377fbf3f98f714323921539a418d83 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Thu, 14 Nov 2024 15:52:59 +0500 Subject: [PATCH] fix: Count unique texts, data leaks in calculate metrics (#1438) * add more stat * add more stat * update statistics --- mteb/abstasks/AbsTask.py | 12 +- mteb/abstasks/AbsTaskBitextMining.py | 39 +- mteb/abstasks/AbsTaskClassification.py | 29 +- mteb/abstasks/AbsTaskClustering.py | 26 +- mteb/abstasks/AbsTaskClusteringFast.py | 21 +- mteb/abstasks/AbsTaskInstructionRetrieval.py | 98 +- .../AbsTaskMultilabelClassification.py | 42 +- mteb/abstasks/AbsTaskPairClassification.py | 42 +- mteb/abstasks/AbsTaskReranking.py | 60 +- mteb/abstasks/AbsTaskRetrieval.py | 58 +- mteb/abstasks/AbsTaskSTS.py | 34 +- mteb/abstasks/AbsTaskSummarization.py | 76 +- .../BitextMining/BUCC.v2.json | 69 + .../BitextMining/BornholmBitextMining.json | 11 +- .../BitextMining/IN22ConvBitextMining.json | 5577 +++- .../BitextMining/IN22GenBitextMining.json | 6595 ++++ .../BitextMining/IWSLT2017BitextMining.json | 329 + .../IndicGenBenchFloresBitextMining.json | 1540 + .../BitextMining/NTREXBitextMining.json | 24925 ++++++++++++++++ .../BitextMining/NollySentiBitextMining.json | 69 + .../NorwegianCourtsBitextMining.json | 15 + .../NusaTranslationBitextMining.json | 132 +- .../BitextMining/PhincBitextMining.json | 30 + .../TbilisiCityHallBitextMining.json | 43 + .../BitextMining/VieMedEVBitextMining.json | 15 + .../LanguageClassification.json | 76 + .../SlovakHateSpeechClassification.json | 22 + .../ArXivHierarchicalClusteringP2P.json | 4 + .../Clustering/BiorxivClusteringS2S.json | 5 + .../Clustering/MedrxivClusteringP2P.v2.json | 168 + .../Clustering/MedrxivClusteringS2S.v2.json | 168 + .../Clustering/RedditClusteringP2P.v2.json | 1335 + .../RuSciBenchGRNTIClusteringP2P.json | 4 + .../TwentyNewsgroupsClustering.v2.json | 75 + .../Clustering/WikiClusteringP2P.json | 75 + .../Core17InstructionRetrieval.json | 18 +- .../CEDRClassification.json | 43 +- 
.../MultiEURLEXMultilabelClassification.json | 1732 -- .../PawsXPairClassification.json | 160 +- .../PairClassification/TwitterURLCorpus.json | 10 +- .../PairClassification/XNLI.json | 300 +- .../Reranking/AskUbuntuDupQuestions.json | 15 +- .../Reranking/ESCIReranking.json | 60 +- .../WikipediaRerankingMultilingual.json | 255 +- .../Retrieval/AppsRetrieval.json | 17 +- .../Retrieval/BelebeleRetrieval.json | 6413 +++- .../Retrieval/COIRCodeSearchNetRetrieval.json | 117 +- .../Retrieval/CodeEditSearchRetrieval.json | 236 +- .../Retrieval/CodeFeedbackMT.json | 17 +- .../Retrieval/CodeFeedbackST.json | 17 +- .../Retrieval/CodeSearchNetCCRetrieval.json | 117 +- .../Retrieval/CodeSearchNetRetrieval.json | 117 +- .../Retrieval/CodeTransOceanContest.json | 17 +- .../Retrieval/CodeTransOceanDL.json | 17 +- mteb/descriptive_stats/Retrieval/CosQA.json | 17 +- .../Retrieval/JaqketRetrieval.json | 17 +- .../descriptive_stats/Retrieval/NFCorpus.json | 11 + .../Retrieval/StackOverflowQA.json | 17 +- .../Retrieval/SyntheticText2SQL.json | 17 +- .../Retrieval/Touche2020.json | 17 +- .../Retrieval/Touche2020Retrieval.v3.json | 17 +- ...lowIRCrossLingualInstructionRetrieval.json | 70 +- .../mFollowIRInstructionRetrieval.json | 70 +- mteb/descriptive_stats/STS/STS12.json | 10 +- mteb/descriptive_stats/STS/STS17.json | 118 +- .../Summarization/SummEval.json | 53 +- tests/test_benchmark/mock_tasks.py | 652 +- tests/test_tasks/test_metadata.py | 4 +- 68 files changed, 47767 insertions(+), 4820 deletions(-) create mode 100644 mteb/descriptive_stats/BitextMining/BUCC.v2.json create mode 100644 mteb/descriptive_stats/BitextMining/IN22GenBitextMining.json create mode 100644 mteb/descriptive_stats/BitextMining/IWSLT2017BitextMining.json create mode 100644 mteb/descriptive_stats/BitextMining/IndicGenBenchFloresBitextMining.json create mode 100644 mteb/descriptive_stats/BitextMining/NTREXBitextMining.json create mode 100644 mteb/descriptive_stats/BitextMining/NollySentiBitextMining.json create 
mode 100644 mteb/descriptive_stats/BitextMining/NorwegianCourtsBitextMining.json create mode 100644 mteb/descriptive_stats/BitextMining/PhincBitextMining.json create mode 100644 mteb/descriptive_stats/BitextMining/TbilisiCityHallBitextMining.json create mode 100644 mteb/descriptive_stats/BitextMining/VieMedEVBitextMining.json create mode 100644 mteb/descriptive_stats/Clustering/MedrxivClusteringP2P.v2.json create mode 100644 mteb/descriptive_stats/Clustering/MedrxivClusteringS2S.v2.json create mode 100644 mteb/descriptive_stats/Clustering/RedditClusteringP2P.v2.json create mode 100644 mteb/descriptive_stats/Clustering/TwentyNewsgroupsClustering.v2.json delete mode 100644 mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json create mode 100644 mteb/descriptive_stats/Retrieval/NFCorpus.json diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 0bdbdeaf8..8b9edfd52 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -200,7 +200,11 @@ def calculate_metadata_metrics( descriptive_stats = {} hf_subset_stat = "hf_subset_descriptive_stats" - pbar_split = tqdm.tqdm(self.metadata.eval_splits, desc="Processing Splits...") + eval_splits = self.metadata.eval_splits + if self.metadata.type in ["Classification", "MultilabelClassification"]: + eval_splits += ["train"] + + pbar_split = tqdm.tqdm(eval_splits, desc="Processing Splits...") for split in pbar_split: pbar_split.set_postfix_str(f"Split: {split}") logger.info(f"Processing metadata for split {split}") @@ -215,12 +219,8 @@ def calculate_metadata_metrics( if isinstance(self.metadata.eval_langs, dict) else self.metadata.eval_langs ) - if self.metadata.type == "Classification": - eval_langs += ["train"] - pbar_subsets = tqdm.tqdm( - self.metadata.eval_langs, desc="Processing Languages..." 
- ) + pbar_subsets = tqdm.tqdm(eval_langs, desc="Processing Languages...") for hf_subset in pbar_subsets: pbar_subsets.set_postfix_str(f"Language: {hf_subset}") logger.info(f"Processing metadata for language {hf_subset}") diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index 00a9160b9..59d64039f 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -21,14 +21,31 @@ class BitextDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the dataset. number_of_characters: Total number of symbols in the dataset. + unique_pairs: Number of unique pairs + + min_sentence1_length: Minimum length of sentence1 average_sentence1_length: Average length of sentence1 + max_sentence1_length: Maximum length of sentence1 + unique_sentence1: Number of unique sentence1 values + + min_sentence2_length: Minimum length of sentence2 average_sentence2_length: Average length of sentence2 + max_sentence2_length: Maximum length of sentence2 """ num_samples: int number_of_characters: int + unique_pairs: int + + min_sentence1_length: int average_sentence1_length: float + max_sentence1_length: int + unique_sentence1: int + + min_sentence2_length: int average_sentence2_length: float + max_sentence2_length: int + unique_sentence2: int class AbsTaskBitextMining(AbsTask): @@ -153,12 +170,24 @@ def _calculate_metrics_from_split( sent_1, sent_2 = pairs_cols[0] sentence1 = self.dataset[split][sent_1] sentence2 = self.dataset[split][sent_2] - total_s1_len = sum([len(s1) for s1 in sentence1]) - total_s2_len = sum([len(s2) for s2 in sentence2]) - + s1_len = [len(s1) for s1 in sentence1] + s2_len = [len(s2) for s2 in sentence2] + total_s1_len = sum(s1_len) + total_s2_len = sum(s2_len) + + unique_pairs = len(set(zip(sentence1, sentence2))) + unique_sentence1 = len(set(sentence1)) + unique_sentence2 = len(set(sentence2)) return BitextDescriptiveStatistics( -
average_sentence1_length=total_s1_len / len(sentence1), - average_sentence2_length=total_s2_len / len(sentence2), num_samples=len(sentence1), number_of_characters=total_s1_len + total_s2_len, + unique_pairs=unique_pairs, + min_sentence1_length=min(s1_len), + average_sentence1_length=sum(s1_len) / len(sentence1), + max_sentence1_length=max(s1_len), + unique_sentence1=unique_sentence1, + min_sentence2_length=min(s2_len), + average_sentence2_length=total_s2_len / len(sentence2), + max_sentence2_length=max(s2_len), + unique_sentence2=unique_sentence2, ) diff --git a/mteb/abstasks/AbsTaskClassification.py b/mteb/abstasks/AbsTaskClassification.py index 58b4441a1..62908c98a 100644 --- a/mteb/abstasks/AbsTaskClassification.py +++ b/mteb/abstasks/AbsTaskClassification.py @@ -26,14 +26,26 @@ class ClassificationDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the dataset. number_of_characters: Total number of symbols in the dataset. + num_texts_in_train: Number of unique texts in this split that also appear in the train split (None for the train split) + + min_text_length: Minimum length of text average_text_length: Average length of text + max_text_length: Maximum length of text + unique_text: Number of unique texts + unique_labels: Number of unique labels labels: dict of label frequencies """ num_samples: int number_of_characters: int + num_texts_in_train: int | None + + min_text_length: int average_text_length: float + max_text_length: int + unique_text: int + unique_labels: int labels: dict[str, dict[str, int]] @@ -205,25 +217,40 @@ def _undersample_data(self, X, y, samples_per_label: int, idxs=None): def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False ) -> ClassificationDescriptiveStatistics: + train_text = [] if hf_subset: text = self.dataset[hf_subset][split]["text"] label = self.dataset[hf_subset][split]["label"] + if split != "train": + train_text = self.dataset[hf_subset]["train"]["text"] elif compute_overall: text = [] label
= [] for hf_subset in self.metadata.eval_langs: text.extend(self.dataset[hf_subset][split]["text"]) label.extend(self.dataset[hf_subset][split]["label"]) + if split != "train": + train_text.extend(self.dataset[hf_subset]["train"]["text"]) else: text = self.dataset[split]["text"] label = self.dataset[split]["label"] + if split != "train": + train_text = self.dataset["train"]["text"] - total_text_len = sum([len(t) for t in text]) + text_len = [len(t) for t in text] + total_text_len = sum(text_len) label_count = Counter(label) + num_texts_in_train = ( + len(set(text) & set(train_text)) if split != "train" else None + ) return ClassificationDescriptiveStatistics( num_samples=len(text), number_of_characters=total_text_len, + num_texts_in_train=num_texts_in_train, + min_text_length=min(text_len), average_text_length=total_text_len / len(text), + max_text_length=max(text_len), + unique_text=len(set(text)), unique_labels=len(label_count), labels={ str(label): {"count": count} for label, count in label_count.items() diff --git a/mteb/abstasks/AbsTaskClustering.py b/mteb/abstasks/AbsTaskClustering.py index 7f2c94e14..3b5d0f492 100644 --- a/mteb/abstasks/AbsTaskClustering.py +++ b/mteb/abstasks/AbsTaskClustering.py @@ -24,16 +24,31 @@ class ClusteringDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the dataset. number_of_characters: Total number of symbols in the dataset. 
+ + min_text_length: Minimum length of text average_text_length: Average length of text + max_text_length: Maximum length of text + unique_texts: Number of unique texts + + min_labels_per_text: Minimum number of labels per text average_labels_per_text: Average number of labels per text + max_labels_per_text: Maximum number of labels per text unique_labels: Number of unique labels labels: dict of label frequencies """ num_samples: int number_of_characters: int + + min_text_length: int average_text_length: float + max_text_length: int + unique_texts: int + + min_labels_per_text: int average_labels_per_text: float + max_labels_per_text: int + unique_labels: int labels: dict[str, dict[str, int]] @@ -96,7 +111,11 @@ def _calculate_metrics_from_split( sentences = self.dataset[split]["sentences"] labels = self.dataset[split]["labels"] - total_text_len = sum([len(t) for t in sentences]) + text_len = [len(t) for t in sentences] + all_sentences = [] + for s in sentences: + all_sentences.extend(s) + total_text_len = sum(text_len) total_labels = [] for label in labels: if isinstance(label, list): @@ -107,8 +126,13 @@ def _calculate_metrics_from_split( return ClusteringDescriptiveStatistics( num_samples=len(sentences), number_of_characters=total_text_len, + min_text_length=min(text_len), average_text_length=total_text_len / len(sentences), + max_text_length=max(text_len), + unique_texts=len(set(all_sentences)), + min_labels_per_text=min(label_counter.values()), average_labels_per_text=len(total_labels) / len(sentences), + max_labels_per_text=max(label_counter.values()), unique_labels=len(label_counter), labels={ str(label): { diff --git a/mteb/abstasks/AbsTaskClusteringFast.py b/mteb/abstasks/AbsTaskClusteringFast.py index fedf392f7..40e36d29e 100644 --- a/mteb/abstasks/AbsTaskClusteringFast.py +++ b/mteb/abstasks/AbsTaskClusteringFast.py @@ -85,16 +85,30 @@ class ClusteringFastDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the 
dataset. number_of_characters: Total number of symbols in the dataset. + + min_text_length: Minimum length of text average_text_length: Average length of text + max_text_length: Maximum length of text + unique_texts: Number of unique texts + + min_labels_per_text: Minimum number of labels per text average_labels_per_text: Average number of labels per text + max_labels_per_text: Maximum number of labels per text unique_labels: Number of unique labels labels: dict of label frequencies """ num_samples: int number_of_characters: int + + min_text_length: int average_text_length: float + max_text_length: int + unique_texts: int + + min_labels_per_text: int average_labels_per_text: float + max_labels_per_text: int unique_labels: int labels: dict[str, dict[str, int]] @@ -226,7 +240,8 @@ def _calculate_metrics_from_split( sentences = self.dataset[split]["sentences"] labels = self.dataset[split]["labels"] - total_text_len = sum([len(t) for t in sentences]) + text_len = [len(t) for t in sentences] + total_text_len = sum(text_len) total_labels = [] for label in labels: if isinstance(label, list): @@ -237,8 +252,12 @@ def _calculate_metrics_from_split( return ClusteringFastDescriptiveStatistics( num_samples=len(sentences), number_of_characters=total_text_len, + min_text_length=min(text_len), average_text_length=total_text_len / len(sentences), + max_text_length=max(text_len), + min_labels_per_text=min(label_counter.values()), average_labels_per_text=len(total_labels) / len(sentences), + max_labels_per_text=max(label_counter.values()), unique_labels=len(label_counter), labels={ str(label): { diff --git a/mteb/abstasks/AbsTaskInstructionRetrieval.py b/mteb/abstasks/AbsTaskInstructionRetrieval.py index bdbe5cd6c..219426fe6 100644 --- a/mteb/abstasks/AbsTaskInstructionRetrieval.py +++ b/mteb/abstasks/AbsTaskInstructionRetrieval.py @@ -37,6 +37,7 @@ def __init__( qrels_file: str = "", streaming: bool = False, keep_in_memory: bool = False, + trust_remote_code: bool = False, ): 
self.corpus = {} self.queries = {} @@ -69,6 +70,7 @@ def __init__( self.qrels_file = qrels_file self.streaming = streaming self.keep_in_memory = keep_in_memory + self.trust_remote_code = trust_remote_code def load( self, split="test" @@ -227,24 +229,68 @@ class InstructionRetrievalDescriptiveStatistics(DescriptiveStatistics): num_queries: Number of queries num_docs: Number of documents number_of_characters: Total number of symbols in the dataset + + min_document_length: Minimum length of documents average_document_length: Average length of documents + max_document_length: Maximum length of documents + unique_docs: Number of unique documents + + min_query_length: Minimum length of queries average_query_length: Average length of queries + max_query_length: Maximum length of queries + unique_queries: Number of unique queries + + min_instruction_length: Minimum length of instructions average_instruction_length: Average length of instructions + max_instruction_length: Maximum length of instructions + unique_instructions: Number of unique instructions + + min_changed_instruction_length: Minimum length of changed instructions average_changed_instruction_length: Average length of changed instructions + max_changed_instruction_length: Maximum length of changed instructions + unique_changed_instructions: Number of unique changed instructions + + min_average_relevant_docs_per_query: Minimum number of relevant docs per query average_relevant_docs_per_query: Average number of relevant docs per query + max_average_relevant_docs_per_query: Maximum number of relevant docs per query + + min_average_top_ranked_per_query: Minimum number of top ranked docs per query average_top_ranked_per_query: Average number of top ranked docs per query + max_average_top_ranked_per_query: Maximum number of top ranked docs per query """ num_samples: int num_queries: int num_docs: int number_of_characters: int + + min_document_length: int average_document_length: float + max_document_length: int + 
unique_docs: int + + min_query_length: int average_query_length: float + max_query_length: int + unique_queries: int + + min_instruction_length: int average_instruction_length: float + max_instruction_length: int + unique_instructions: int + + min_changed_instruction_length: int average_changed_instruction_length: float + max_changed_instruction_length: int + unique_changed_instructions: int + + min_average_relevant_docs_per_query: float average_relevant_docs_per_query: float + max_average_relevant_docs_per_query: float + + min_average_top_ranked_per_query: float average_top_ranked_per_query: float + max_average_top_ranked_per_query: float class AbsTaskInstructionRetrieval(AbsTask): @@ -665,25 +711,31 @@ def _calculate_metrics_from_split( changed_instructions = self.changed_instructions[split] top_ranked = self.top_ranked[split] - total_corpus_len = sum( - [len(doc.get("title", "")) + len(doc["text"]) for doc in corpus.values()] - ) - total_queries_len = sum([len(query) for query in queries.values()]) - total_instructions_len = sum( - [len(instruction) for instruction in og_instructions.values()] - ) - total_changed_instructions_len = sum( - [len(instruction) for instruction in changed_instructions.values()] - ) - num_qrels_non_zero = sum( + corpus_combined = [ + doc.get("title", "") + doc["text"] for doc in corpus.values() + ] + corpus_len = [len(doc) for doc in corpus_combined] + total_corpus_len = sum(corpus_len) + + queries_len = [len(query) for query in queries.values()] + total_queries_len = sum(queries_len) + instructions_len = [ + len(instruction) for instruction in og_instructions.values() + ] + total_instructions_len = sum(instructions_len) + changed_instructions_len = [ + len(instruction) for instruction in changed_instructions.values() + ] + total_changed_instructions_len = sum(changed_instructions_len) + qrels_non_zero = [ sum(1 for doc_id in docs if docs[doc_id] != 0) for docs in relevant_docs.values() - ) + ] + num_qrels_non_zero = 
sum(qrels_non_zero) qrels_per_doc = num_qrels_non_zero / len(relevant_docs) if len(queries) else 0 + ranked_per_query = [len(docs) for docs in top_ranked.values()] top_ranked_per_query = ( - sum(len(docs) for docs in top_ranked.values()) / len(queries) - if len(queries) - else 0 + sum(ranked_per_query) / len(queries) if len(queries) else 0 ) return InstructionRetrievalDescriptiveStatistics( num_samples=len(queries) + len(corpus), @@ -693,20 +745,36 @@ def _calculate_metrics_from_split( + total_queries_len + total_instructions_len + total_changed_instructions_len, + min_document_length=min(corpus_len), average_document_length=( total_corpus_len / len(corpus) if len(corpus) else 0 ), + max_document_length=max(corpus_len), + unique_docs=len(set(corpus_combined)), + min_query_length=min(queries_len), average_query_length=( total_queries_len / len(queries) if len(queries) else 0 ), + max_query_length=max(queries_len), + unique_queries=len(set(queries.values())), + min_instruction_length=min(instructions_len), average_instruction_length=( total_instructions_len / len(queries) if len(queries) else 0 ), + max_instruction_length=max(instructions_len), + unique_instructions=len(set(og_instructions.values())), + min_changed_instruction_length=min(changed_instructions_len), average_changed_instruction_length=( total_changed_instructions_len / len(queries) if len(queries) else 0 ), + max_changed_instruction_length=max(changed_instructions_len), + unique_changed_instructions=len(set(changed_instructions.values())), + min_average_relevant_docs_per_query=min(qrels_non_zero), average_relevant_docs_per_query=qrels_per_doc, + max_average_relevant_docs_per_query=max(qrels_non_zero), + min_average_top_ranked_per_query=min(ranked_per_query), average_top_ranked_per_query=top_ranked_per_query, + max_average_top_ranked_per_query=max(ranked_per_query), ) diff --git a/mteb/abstasks/AbsTaskMultilabelClassification.py b/mteb/abstasks/AbsTaskMultilabelClassification.py index 
6fd3acf90..38d3722ff 100644 --- a/mteb/abstasks/AbsTaskMultilabelClassification.py +++ b/mteb/abstasks/AbsTaskMultilabelClassification.py @@ -47,16 +47,32 @@ class MultilabelClassificationDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the dataset. number_of_characters: Total number of symbols in the dataset. + number_texts_in_train: Number of unique texts in this split that also appear in the train split (None for the train split) + + min_text_length: Minimum length of text average_text_length: Average length of text + max_text_length: Maximum length of text + unique_texts: Number of unique texts + + min_labels_per_text: Minimum number of labels per text average_label_per_text: Average number of labels per text + max_labels_per_text: Maximum number of labels per text unique_labels: Number of unique labels labels: dict of label frequencies """ num_samples: int number_of_characters: int + number_texts_in_train: int | None + + min_text_length: int average_text_length: float + max_text_length: int + unique_texts: int + + min_labels_per_text: int average_label_per_text: float + max_labels_per_text: int unique_labels: int labels: dict[str, dict[str, int]] @@ -231,30 +247,48 @@ def _undersample_data_indices(self, y, samples_per_label, idxs=None): def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False ) -> MultilabelClassificationDescriptiveStatistics: + train_text = [] if hf_subset: text = self.dataset[hf_subset][split]["text"] label = self.dataset[hf_subset][split]["label"] + if split != "train": + train_text = self.dataset[hf_subset]["train"]["text"] elif compute_overall: text = [] label = [] for hf_subset in self.metadata.eval_langs: text.extend(self.dataset[hf_subset][split]["text"]) label.extend(self.dataset[hf_subset][split]["label"]) + if split != "train": + train_text.extend(self.dataset[hf_subset]["train"]["text"]) else: text = self.dataset[split]["text"] label = self.dataset[split]["label"] + if split != "train": +
train_text = self.dataset["train"]["text"] - total_text_len = sum(len(t) for t in text) - total_label_len = sum(len(l) for l in label) + text_len = [len(t) for t in text] + total_text_len = sum(text_len) + label_len = [len(l) for l in label] + total_label_len = sum(label_len) total_labels = [] for l in label: total_labels.extend(l if len(l) > 0 else [None]) label_count = Counter(total_labels) + num_texts_in_train = ( + len(set(text) & set(train_text)) if split != "train" else None + ) return MultilabelClassificationDescriptiveStatistics( - average_text_length=total_text_len / len(text), + num_samples=len(text), number_of_characters=total_text_len, + number_texts_in_train=num_texts_in_train, + min_text_length=min(text_len), + average_text_length=total_text_len / len(text), + max_text_length=max(text_len), + unique_texts=len(set(text)), + min_labels_per_text=min(label_len), average_label_per_text=total_label_len / len(label), - num_samples=len(text), + max_labels_per_text=max(label_len), unique_labels=len(label_count), labels={ str(label): { diff --git a/mteb/abstasks/AbsTaskPairClassification.py b/mteb/abstasks/AbsTaskPairClassification.py index 0cbdafda8..82ba128c2 100644 --- a/mteb/abstasks/AbsTaskPairClassification.py +++ b/mteb/abstasks/AbsTaskPairClassification.py @@ -20,16 +20,34 @@ class PairClassificationDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the dataset. number_of_characters: Total number of symbols in the dataset. 
- avg_sentence1_len: Average length of sentence1 - avg_sentence2_len: Average length of sentence2 + + min_sentence1_length: Minimum length of sentence1 + avg_sentence1_length: Average length of sentence1 + max_sentence1_length: Maximum length of sentence1 + unique_sentence1: Number of unique sentence1 values + + min_sentence2_length: Minimum length of sentence2 + avg_sentence2_length: Average length of sentence2 + max_sentence2_length: Maximum length of sentence2 + unique_sentence2: Number of unique sentence2 values + unique_labels: Number of unique labels labels: dict of label frequencies """ num_samples: int number_of_characters: int - avg_sentence1_len: float - avg_sentence2_len: float + + min_sentence1_length: int + avg_sentence1_length: float + max_sentence1_length: int + unique_sentence1: int + + min_sentence2_length: int + avg_sentence2_length: float + max_sentence2_length: int + unique_sentence2: int + unique_labels: int labels: dict[str, dict[str, int]] @@ -109,14 +127,22 @@ def _calculate_metrics_from_split( dataset["labels"][0] if len(dataset["labels"]) == 1 else dataset["labels"] ) - total_sentence1_len = sum([len(sentence) for sentence in sentence1]) - total_sentence2_len = sum([len(sentence) for sentence in sentence2]) + sentence1_len = [len(sentence) for sentence in sentence1] + total_sentence1_len = sum(sentence1_len) + sentence2_len = [len(sentence) for sentence in sentence2] + total_sentence2_len = sum(sentence2_len) label_count = Counter(labels) return PairClassificationDescriptiveStatistics( num_samples=len(sentence1), number_of_characters=total_sentence1_len + total_sentence2_len, - avg_sentence1_len=total_sentence1_len / len(sentence1), - avg_sentence2_len=total_sentence2_len / len(sentence2), + min_sentence1_length=min(sentence1_len), + avg_sentence1_length=total_sentence1_len / len(sentence1), + max_sentence1_length=max(sentence1_len), + unique_sentence1=len(set(sentence1)), + min_sentence2_length=min(sentence2_len), +
avg_sentence2_length=total_sentence2_len / len(sentence2), + max_sentence2_length=max(sentence2_len), + unique_sentence2=len(set(sentence2)), unique_labels=len(set(labels)), labels={ str(label): {"count": count} for label, count in label_count.items() diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index 3703b5a3c..ab00a53a3 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -20,18 +20,42 @@ class RerankingDescriptiveStatistics(DescriptiveStatistics): number_of_characters: Total number of symbols in the dataset. num_positive: Number of positive examples num_negative: Number of negative examples - avg_query_len: Average length of queries - avg_positive_len: Average length of positive examples - avg_negative_len: Average length of negative examples + + min_query_length: Minimum length of queries + avg_query_length: Average length of queries + max_query_length: Maximum length of queries + unique_query: Number of unique queries + + min_positive_length: Minimum length of positive examples + avg_positive_length: Average length of positive examples + max_positive_length: Maximum length of positive examples + unique_positive: Number of unique positive examples + + min_negative_length: Minimum length of negative examples + avg_negative_length: Average length of negative examples + max_negative_length: Maximum length of negative examples + unique_negative: Number of unique negative examples """ num_samples: int number_of_characters: int num_positive: int num_negative: int - avg_query_len: float - avg_positive_len: float - avg_negative_len: float + + min_query_length: int + avg_query_length: float + max_query_length: int + unique_query: int + + min_positive_length: int + avg_positive_length: float + max_positive_length: int + unique_positive: int + + min_negative_length: int + avg_negative_length: float + max_negative_length: int + unique_negative: int class AbsTaskReranking(AbsTask): @@ -98,9 +122,12 
@@ def _calculate_metrics_from_split( positive = transform_reranking_data(self.dataset[split]["positive"]) negative = transform_reranking_data(self.dataset[split]["negative"]) - total_len_query = sum([len(q) for q in query]) - total_len_positive = sum([len(p) for p in positive]) - total_len_negative = sum([len(n) for n in negative]) + len_query = [len(q) for q in query] + total_len_query = sum(len_query) + len_positive = [len(p) for p in positive] + total_len_positive = sum(len_positive) + len_negative = [len(n) for n in negative] + total_len_negative = sum(len_negative) return RerankingDescriptiveStatistics( num_samples=len(query), number_of_characters=total_len_query @@ -108,9 +135,18 @@ def _calculate_metrics_from_split( + total_len_negative, num_positive=len(positive), num_negative=len(negative), - avg_query_len=total_len_query / len(query), - avg_positive_len=total_len_positive / len(positive), - avg_negative_len=total_len_negative / len(negative), + min_query_length=min(len_query), + avg_query_length=total_len_query / len(query), + max_query_length=max(len_query), + unique_query=len(set(query)), + min_positive_length=min(len_positive), + avg_positive_length=total_len_positive / len(positive), + max_positive_length=max(len_positive), + unique_positive=len(set(positive)), + min_negative_length=min(len_negative), + avg_negative_length=total_len_negative / len(negative), + max_negative_length=max(len_negative), + unique_negative=len(set(negative)), ) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 078979b6b..95746e1a2 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -206,18 +206,42 @@ class RetrievalDescriptiveStatistics(DescriptiveStatistics): num_queries: number of queries in the dataset num_documents: Number of documents number_of_characters: Total number of symbols in the dataset + + min_document_length: Minimum length of documents average_document_length: Average length of 
documents + max_document_length: Maximum length of documents + unique_documents: Number of unique documents + + min_query_length: Minimum length of queries average_query_length: Average length of queries + max_query_length: Maximum length of queries + unique_queries: Number of unique queries + + min_relevant_docs_per_query: Minimum number of relevant documents per query average_relevant_docs_per_query: Average number of relevant documents per query + max_relevant_docs_per_query: Maximum number of relevant documents per query + unique_relevant_docs: Number of unique relevant documents """ num_samples: int num_queries: int num_documents: int number_of_characters: int + + min_document_length: int average_document_length: float + max_document_length: int + unique_documents: int + + min_query_length: int average_query_length: float + max_query_length: int + unique_queries: int + + min_relevant_docs_per_query: int average_relevant_docs_per_query: float + max_relevant_docs_per_query: int + unique_relevant_docs: int class AbsTaskRetrieval(AbsTask): @@ -436,26 +460,36 @@ def _calculate_metrics_from_split( num_documents = len(corpus) num_queries = len(queries) - # number of qrels that are not 0 - num_qrels_non_zero = sum( - sum(1 for doc_id in docs if docs[doc_id] != 0) - for docs in relevant_docs.values() - ) - qrels_per_doc = num_qrels_non_zero / len(relevant_docs) if num_queries else 0 + # create a list of number of relevant docs per query + qrels_lengths = [ + len(relevant_docs[qid]) for qid in relevant_docs if qid in queries + ] + num_qrels = sum(qrels_lengths) + qrels_per_doc = num_qrels / len(relevant_docs) if num_queries else 0 + unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]}) return RetrievalDescriptiveStatistics( - number_of_characters=query_len + doc_len, + number_of_characters=sum(query_len) + sum(doc_len), num_samples=num_documents + num_queries, num_queries=num_queries, num_documents=num_documents, - 
average_document_length=doc_len / num_documents, - average_query_length=query_len / num_queries, + min_document_length=min(doc_len), + average_document_length=sum(doc_len) / num_documents, + max_document_length=max(doc_len), + unique_documents=len(set(corpus)), + min_query_length=min(query_len), + average_query_length=sum(query_len) / num_queries, + max_query_length=max(query_len), + unique_queries=len(set(queries)), + min_relevant_docs_per_query=min(qrels_lengths), average_relevant_docs_per_query=qrels_per_doc, + max_relevant_docs_per_query=max(qrels_lengths), + unique_relevant_docs=unique_qrels, ) def calculate_length( queries: dict[str, str], corpus: dict[str, str] -) -> tuple[int, int]: +) -> tuple[list[int], list[int]]: queries_lens = [] doc_lens = [] for query in queries.values(): @@ -467,9 +501,7 @@ def calculate_length( for doc in corpus.values(): doc_lens.append(len(doc)) - doc_len = sum(doc_lens) / len(doc_lens) if doc_lens else 0 - query_len = sum(queries_lens) / len(queries_lens) if queries_lens else 0 - return query_len, doc_len + return doc_lens, queries_lens def process_docs( diff --git a/mteb/abstasks/AbsTaskSTS.py b/mteb/abstasks/AbsTaskSTS.py index c9fa896b6..d12b88545 100644 --- a/mteb/abstasks/AbsTaskSTS.py +++ b/mteb/abstasks/AbsTaskSTS.py @@ -17,16 +17,36 @@ class STSDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the dataset. number_of_characters: Total number of symbols in the dataset. 
+ + min_sentence1_length: Minimum length of sentence1 average_sentence1_len: Average length of sentence1 + max_sentence1_length: Maximum length of sentence1 + + min_sentence2_length: Minimum length of sentence2 average_sentence2_len: Average length of sentence2 + max_sentence2_length: Maximum length of sentence2 + + min_score: Minimum score avg_score: Average score + max_score: Maximum score """ num_samples: int number_of_characters: int + + min_sentence1_length: int average_sentence1_len: float + max_sentence1_length: int + unique_sentence1: int + + min_sentence2_length: int average_sentence2_len: float + max_sentence2_length: int + unique_sentence2: int + + min_score: float avg_score: float + max_score: float class AbsTaskSTS(AbsTask): @@ -93,13 +113,23 @@ def _calculate_metrics_from_split( sentence2 = self.dataset[split]["sentence2"] score = self.dataset[split]["score"] - total_sentence1_len = sum([len(s) for s in sentence1]) - total_sentence2_len = sum([len(s) for s in sentence2]) + sentence1_len = [len(s) for s in sentence1] + sentence2_len = [len(s) for s in sentence2] + total_sentence1_len = sum(sentence1_len) + total_sentence2_len = sum(sentence2_len) avg_score = sum(score) / len(score) return STSDescriptiveStatistics( num_samples=len(sentence1), number_of_characters=total_sentence1_len + total_sentence2_len, + min_sentence1_length=min(sentence1_len), average_sentence1_len=total_sentence1_len / len(sentence1), + max_sentence1_length=max(sentence1_len), + unique_sentence1=len(set(sentence1)), + min_sentence2_length=min(sentence2_len), average_sentence2_len=total_sentence2_len / len(sentence2), + max_sentence2_length=max(sentence2_len), + unique_sentence2=len(set(sentence2)), + min_score=min(score), avg_score=avg_score, + max_score=max(score), ) diff --git a/mteb/abstasks/AbsTaskSummarization.py b/mteb/abstasks/AbsTaskSummarization.py index 6d792c319..07fd42057 100644 --- a/mteb/abstasks/AbsTaskSummarization.py +++ b/mteb/abstasks/AbsTaskSummarization.py @@ 
-21,18 +21,48 @@ class SummarizationDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the dataset. number_of_characters: Total number of symbols in the dataset. - avg_text_len: Average length of text - avg_human_summaries_len: Average length of human summaries - avg_machine_summaries_len: Average length of machine summaries + + min_text_length: Minimum length of text + avg_text_length: Average length of text + max_text_length: Maximum length of text + unique_texts: Number of unique texts + + min_human_summaries_length: Minimum length of human summaries + avg_human_summaries_length: Average length of human summaries + max_human_summaries_length: Maximum length of human summaries + unique_human_summaries: Number of unique human summaries + + min_machine_summaries_length: Minimum length of machine summaries + avg_machine_summaries_length: Average length of machine summaries + max_machine_summaries_length: Maximum length of machine summaries + unique_machine_summaries: Number of unique machine summaries + + min_relevance: Minimum relevance score avg_relevance: Average relevance score + max_relevance: Maximum relevance score """ num_samples: int number_of_characters: int - avg_text_len: float - avg_human_summaries_len: float - avg_machine_summaries_len: float + + min_text_length: int + avg_text_length: float + max_text_length: int + unique_texts: int + + min_human_summaries_length: int + avg_human_summaries_length: float + max_human_summaries_length: int + unique_human_summaries: int + + min_machine_summaries_length: int + avg_machine_summaries_length: float + max_machine_summaries_length: int + unique_machine_summaries: int + + min_relevance: float avg_relevance: float + max_relevance: float class AbsTaskSummarization(AbsTask): @@ -112,17 +142,39 @@ def _calculate_metrics_from_split( machine_summaries = self.dataset[split]["machine_summaries"] relevance = self.dataset[split]["relevance"] - total_text_len = sum(len(x) for x in 
text) - total_human_summaries_len = sum(len(x) for x in human_summaries) - total_machine_summaries_len = sum(len(x) for x in machine_summaries) + all_human_summaries = [] + for s in human_summaries: + all_human_summaries.extend(s) + + all_machine_summaries = [] + for s in machine_summaries: + all_machine_summaries.extend(s) + + text_len = [len(t) for t in text] + total_text_len = sum(text_len) + human_summaries_len = [len(s) for s in human_summaries] + total_human_summaries_len = sum(human_summaries_len) + machine_summaries_len = [len(s) for s in machine_summaries] + total_machine_summaries_len = sum(machine_summaries_len) total_relevance = sum(sum(x) / len(x) for x in relevance) return SummarizationDescriptiveStatistics( num_samples=len(text), number_of_characters=total_text_len + total_human_summaries_len + total_machine_summaries_len, - avg_text_len=total_text_len / len(text), - avg_human_summaries_len=total_human_summaries_len / len(text), - avg_machine_summaries_len=total_machine_summaries_len / len(text), + min_text_length=min(text_len), + avg_text_length=total_text_len / len(text), + max_text_length=max(text_len), + unique_texts=len(set(text)), + min_human_summaries_length=min(human_summaries_len), + avg_human_summaries_length=total_human_summaries_len / len(text), + max_human_summaries_length=max(human_summaries_len), + unique_human_summaries=len(set(all_human_summaries)), + min_machine_summaries_length=min(machine_summaries_len), + avg_machine_summaries_length=total_machine_summaries_len / len(text), + max_machine_summaries_length=max(machine_summaries_len), + unique_machine_summaries=len(set(all_machine_summaries)), + min_relevance=min(relevance), avg_relevance=total_relevance / len(relevance), + max_relevance=max(relevance), ) diff --git a/mteb/descriptive_stats/BitextMining/BUCC.v2.json b/mteb/descriptive_stats/BitextMining/BUCC.v2.json new file mode 100644 index 000000000..75ef75ced --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/BUCC.v2.json 
@@ -0,0 +1,69 @@ +{ + "test": { + "num_samples": 35000, + "number_of_characters": 6640032, + "unique_pairs": 34978, + "min_sentence1_length": 16, + "average_sentence1_length": 99.10931428571429, + "max_sentence1_length": 204, + "unique_sentence1": 34978, + "min_sentence2_length": 42, + "average_sentence2_length": 90.60588571428572, + "max_sentence2_length": 159, + "unique_sentence2": 25306, + "hf_subset_descriptive_stats": { + "de-en": { + "num_samples": 9580, + "number_of_characters": 1919197, + "unique_pairs": 9573, + "min_sentence1_length": 50, + "average_sentence1_length": 109.07974947807934, + "max_sentence1_length": 204, + "unique_sentence1": 9573, + "min_sentence2_length": 46, + "average_sentence2_length": 91.25396659707724, + "max_sentence2_length": 155, + "unique_sentence2": 9570 + }, + "fr-en": { + "num_samples": 9086, + "number_of_characters": 1677545, + "unique_pairs": 9081, + "min_sentence1_length": 43, + "average_sentence1_length": 99.31785163988553, + "max_sentence1_length": 174, + "unique_sentence1": 9081, + "min_sentence2_length": 42, + "average_sentence2_length": 85.3117983711204, + "max_sentence2_length": 159, + "unique_sentence2": 9076 + }, + "ru-en": { + "num_samples": 14435, + "number_of_characters": 2808206, + "unique_pairs": 14425, + "min_sentence1_length": 40, + "average_sentence1_length": 101.6593003117423, + "max_sentence1_length": 186, + "unique_sentence1": 14425, + "min_sentence2_length": 45, + "average_sentence2_length": 92.88216141323173, + "max_sentence2_length": 159, + "unique_sentence2": 14424 + }, + "zh-en": { + "num_samples": 1899, + "number_of_characters": 235084, + "unique_pairs": 1899, + "min_sentence1_length": 16, + "average_sentence1_length": 28.429699842022117, + "max_sentence1_length": 40, + "unique_sentence1": 1899, + "min_sentence2_length": 48, + "average_sentence2_length": 95.3638757240653, + "max_sentence2_length": 159, + "unique_sentence2": 1899 + } + } + } +} \ No newline at end of file diff --git 
a/mteb/descriptive_stats/BitextMining/BornholmBitextMining.json b/mteb/descriptive_stats/BitextMining/BornholmBitextMining.json index 131c9966a..0675e5e0e 100644 --- a/mteb/descriptive_stats/BitextMining/BornholmBitextMining.json +++ b/mteb/descriptive_stats/BitextMining/BornholmBitextMining.json @@ -1,8 +1,15 @@ { "test": { + "num_samples": 500, + "number_of_characters": 44361, + "unique_pairs": 500, + "min_sentence1_length": 1, "average_sentence1_length": 49.834, + "max_sentence1_length": 555, + "unique_sentence1": 497, + "min_sentence2_length": 5, "average_sentence2_length": 38.888, - "num_samples": 500, - "number_of_characters": 44361 + "max_sentence2_length": 453, + "unique_sentence2": 491 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/IN22ConvBitextMining.json b/mteb/descriptive_stats/BitextMining/IN22ConvBitextMining.json index 507d9ad7b..effafd237 100644 --- a/mteb/descriptive_stats/BitextMining/IN22ConvBitextMining.json +++ b/mteb/descriptive_stats/BitextMining/IN22ConvBitextMining.json @@ -1,3045 +1,6594 @@ { "test": { - "average_sentence1_length": 54.32948595562498, - "average_sentence2_length": 54.32948595562498, "num_samples": 760518, "number_of_characters": 82637104, + "unique_pairs": 759283, + "min_sentence1_length": 3, + "average_sentence1_length": 54.32948595562498, + "max_sentence1_length": 239, + "unique_sentence1": 34430, + "min_sentence2_length": 3, + "average_sentence2_length": 54.32948595562498, + "max_sentence2_length": 239, + "unique_sentence2": 34430, "hf_subset_descriptive_stats": { "asm_Beng-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 155988, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 155988 + "max_sentence2_length": 178, + "unique_sentence2": 1497 
}, "asm_Beng-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 162044, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 162044 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "asm_Beng-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 167032, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 167032 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "asm_Beng-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 160716, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 160716 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "asm_Beng-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 156282, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 156282 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "asm_Beng-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 158269, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + 
"min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 158269 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "asm_Beng-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 159964, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 159964 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "asm_Beng-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 165177, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 165177 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "asm_Beng-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 164681, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 164681 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "asm_Beng-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 162408, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 162408 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "asm_Beng-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 
172838, + "unique_pairs": 1498, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 172838 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "asm_Beng-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 162747, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 162747 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "asm_Beng-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 157316, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 157316 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "asm_Beng-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 160906, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 160906 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "asm_Beng-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 164223, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, 
- "number_of_characters": 164223 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "asm_Beng-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 160201, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 160201 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "asm_Beng-san_Deva": { + "num_samples": 1503, + "number_of_characters": 158093, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 158093 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "asm_Beng-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 169379, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 169379 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "asm_Beng-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 162623, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 162623 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "asm_Beng-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 174866, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 
53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 174866 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "asm_Beng-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 157690, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 157690 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "asm_Beng-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 161305, + "unique_pairs": 1498, + "min_sentence1_length": 4, "average_sentence1_length": 53.753825681969396, + "max_sentence1_length": 208, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 161305 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "ben_Beng-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 155988, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 155988 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "ben_Beng-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 156448, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 156448 + "max_sentence2_length": 210, + "unique_sentence2": 
1498 }, "ben_Beng-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 161436, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 161436 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "ben_Beng-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 155120, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 155120 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "ben_Beng-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 150686, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 150686 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "ben_Beng-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 152673, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 152673 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "ben_Beng-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 154368, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + 
"min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 154368 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "ben_Beng-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 159581, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 159581 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "ben_Beng-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 159085, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 159085 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "ben_Beng-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 156812, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 156812 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "ben_Beng-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 167242, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 167242 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "ben_Beng-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 157151, + 
"unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 157151 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "ben_Beng-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 151720, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 151720 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "ben_Beng-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 155310, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 155310 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "ben_Beng-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 158627, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 158627 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "ben_Beng-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 154605, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - 
"number_of_characters": 154605 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "ben_Beng-san_Deva": { + "num_samples": 1503, + "number_of_characters": 152497, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 152497 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "ben_Beng-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 163783, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 163783 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "ben_Beng-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 157027, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 157027 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "ben_Beng-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 169270, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 169270 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "ben_Beng-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 152094, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 
50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 152094 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "ben_Beng-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 155709, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.03060545575516, + "max_sentence1_length": 178, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 155709 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "brx_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 162044, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 162044 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "brx_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 156448, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 156448 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "brx_Deva-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 167492, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 167492 + "max_sentence2_length": 209, + "unique_sentence2": 1499 
}, "brx_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 161176, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 161176 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "brx_Deva-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 156742, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 156742 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "brx_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 158729, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 158729 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "brx_Deva-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 160424, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 160424 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "brx_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 165637, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, 
"average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 165637 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "brx_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 165141, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 165141 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "brx_Deva-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 162868, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 162868 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "brx_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 173298, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 173298 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "brx_Deva-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 163207, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 163207 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "brx_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 157776, + "unique_pairs": 1502, + 
"min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 157776 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "brx_Deva-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 161366, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 161366 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "brx_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 164683, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 164683 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "brx_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 160661, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 160661 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "brx_Deva-san_Deva": { + "num_samples": 1503, + "number_of_characters": 158553, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 158553 + 
"max_sentence2_length": 181, + "unique_sentence2": 1500 }, "brx_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 169839, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 169839 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "brx_Deva-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 163083, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 163083 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "brx_Deva-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 175326, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 175326 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "brx_Deva-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 158150, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, + "unique_sentence1": 1498, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 158150 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "brx_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 161765, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.05988023952096, + "max_sentence1_length": 210, 
+ "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 161765 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "doi_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 167032, + "unique_pairs": 1500, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 167032 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "doi_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 161436, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 161436 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "doi_Deva-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 167492, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 167492 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "doi_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 166164, + "unique_pairs": 1500, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 166164 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "doi_Deva-gom_Deva": { + "num_samples": 1503, + 
"number_of_characters": 161730, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 161730 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "doi_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 163717, + "unique_pairs": 1503, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 163717 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "doi_Deva-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 165412, + "unique_pairs": 1503, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 165412 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "doi_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 170625, + "unique_pairs": 1503, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 170625 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "doi_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 170129, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - 
"num_samples": 1503, - "number_of_characters": 170129 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "doi_Deva-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 167856, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 167856 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "doi_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 178286, + "unique_pairs": 1500, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 178286 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "doi_Deva-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 168195, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 168195 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "doi_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 162764, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 162764 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "doi_Deva-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 166354, + "unique_pairs": 1501, + "min_sentence1_length": 6, 
"average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 166354 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "doi_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 169671, + "unique_pairs": 1503, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 169671 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "doi_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 165649, + "unique_pairs": 1500, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 165649 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "doi_Deva-san_Deva": { + "num_samples": 1503, + "number_of_characters": 163541, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 163541 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "doi_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 174827, + "unique_pairs": 1503, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 174827 + "max_sentence2_length": 225, + 
"unique_sentence2": 1500 }, "doi_Deva-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 168071, + "unique_pairs": 1500, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 168071 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "doi_Deva-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 180314, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 180314 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "doi_Deva-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 163138, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 163138 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "doi_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 166753, + "unique_pairs": 1500, + "min_sentence1_length": 6, "average_sentence1_length": 57.37857618097139, + "max_sentence1_length": 209, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 166753 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "eng_Latn-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 160716, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + 
"min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 160716 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "eng_Latn-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 155120, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 155120 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "eng_Latn-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 161176, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 161176 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "eng_Latn-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 166164, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 166164 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "eng_Latn-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 155414, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 155414 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "eng_Latn-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 157401, 
+ "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 157401 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "eng_Latn-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 159096, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 159096 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "eng_Latn-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 164309, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 164309 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "eng_Latn-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 163813, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 163813 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "eng_Latn-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 161540, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - 
"number_of_characters": 161540 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "eng_Latn-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 171970, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 171970 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "eng_Latn-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 161879, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 161879 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "eng_Latn-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 156448, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 156448 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "eng_Latn-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 160038, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 160038 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "eng_Latn-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 163355, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, 
+ "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 163355 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "eng_Latn-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 159333, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 159333 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "eng_Latn-san_Deva": { + "num_samples": 1503, + "number_of_characters": 157225, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 157225 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "eng_Latn-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 168511, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 168511 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "eng_Latn-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 161755, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 161755 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, 
"eng_Latn-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 173998, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 173998 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "eng_Latn-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 156822, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 156822 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "eng_Latn-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 160437, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.17631403858949, + "max_sentence1_length": 201, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 160437 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "gom_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 156282, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 156282 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "gom_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 150686, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, 
"average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 150686 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "gom_Deva-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 156742, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 156742 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "gom_Deva-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 161730, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 161730 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "gom_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 155414, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 155414 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "gom_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 152967, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 152967 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "gom_Deva-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 154662, + "unique_pairs": 1502, + 
"min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 154662 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "gom_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 159875, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 159875 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "gom_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 159379, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 159379 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "gom_Deva-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 157106, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 157106 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "gom_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 167536, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 167536 + 
"max_sentence2_length": 219, + "unique_sentence2": 1495 }, "gom_Deva-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 157445, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 157445 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "gom_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 152014, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 152014 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "gom_Deva-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 155604, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 155604 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "gom_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 158921, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 158921 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "gom_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 154899, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, 
+ "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 154899 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "gom_Deva-san_Deva": { + "num_samples": 1503, + "number_of_characters": 152791, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 152791 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "gom_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 164077, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 164077 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "gom_Deva-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 157321, + "unique_pairs": 1500, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 157321 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "gom_Deva-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 169564, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 169564 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "gom_Deva-tel_Telu": { + "num_samples": 1503, + 
"number_of_characters": 152388, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 152388 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "gom_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 156003, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 50.22621423819029, + "max_sentence1_length": 203, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 156003 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "guj_Gujr-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 158269, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 158269 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "guj_Gujr-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 152673, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 152673 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "guj_Gujr-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 158729, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - 
"num_samples": 1503, - "number_of_characters": 158729 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "guj_Gujr-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 163717, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 163717 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "guj_Gujr-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 157401, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 157401 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "guj_Gujr-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 152967, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 152967 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "guj_Gujr-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 156649, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 156649 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "guj_Gujr-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 161862, + "unique_pairs": 1503, + "min_sentence1_length": 4, 
"average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 161862 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "guj_Gujr-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 161366, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 161366 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "guj_Gujr-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 159093, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 159093 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "guj_Gujr-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 169523, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 169523 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "guj_Gujr-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 159432, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 159432 + "max_sentence2_length": 221, + 
"unique_sentence2": 1501 }, "guj_Gujr-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 154001, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 154001 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "guj_Gujr-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 157591, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 157591 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "guj_Gujr-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 160908, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 160908 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "guj_Gujr-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 156886, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 156886 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "guj_Gujr-san_Deva": { + "num_samples": 1503, + "number_of_characters": 154778, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + 
"min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 154778 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "guj_Gujr-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 166064, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 166064 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "guj_Gujr-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 159308, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 159308 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "guj_Gujr-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 171551, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 171551 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "guj_Gujr-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 154375, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 154375 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "guj_Gujr-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 157990, 
+ "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 51.54823685961411, + "max_sentence1_length": 205, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 157990 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "hin_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 159964, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 159964 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "hin_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 154368, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 154368 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "hin_Deva-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 160424, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 160424 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "hin_Deva-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 165412, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - 
"number_of_characters": 165412 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "hin_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 159096, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 159096 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "hin_Deva-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 154662, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 154662 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "hin_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 156649, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 156649 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "hin_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 163557, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 163557 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "hin_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 163061, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, 
+ "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 163061 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "hin_Deva-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 160788, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 160788 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "hin_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 171218, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 171218 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "hin_Deva-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 161127, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 161127 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "hin_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 155696, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 155696 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "hin_Deva-npi_Deva": 
{ + "num_samples": 1503, + "number_of_characters": 159286, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 159286 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "hin_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 162603, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 162603 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "hin_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 158581, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 158581 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "hin_Deva-san_Deva": { + "num_samples": 1503, + "number_of_characters": 156473, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 156473 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "hin_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 167759, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 7, 
"average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 167759 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "hin_Deva-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 161003, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 161003 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "hin_Deva-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 173246, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 173246 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "hin_Deva-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 156070, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 156070 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "hin_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 159685, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.67598137059215, + "max_sentence1_length": 192, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 159685 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "kan_Knda-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 165177, + "unique_pairs": 1503, 
+ "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 165177 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "kan_Knda-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 159581, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 159581 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "kan_Knda-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 165637, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 165637 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "kan_Knda-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 170625, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 170625 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "kan_Knda-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 164309, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 164309 + 
"max_sentence2_length": 201, + "unique_sentence2": 1497 }, "kan_Knda-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 159875, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 159875 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "kan_Knda-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 161862, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 161862 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "kan_Knda-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 163557, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 163557 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "kan_Knda-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 168274, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 168274 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "kan_Knda-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 166001, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + 
"unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 166001 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "kan_Knda-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 176431, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 176431 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "kan_Knda-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 166340, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 166340 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "kan_Knda-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 160909, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 160909 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "kan_Knda-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 164499, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 164499 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "kan_Knda-ory_Orya": { + "num_samples": 1503, + 
"number_of_characters": 167816, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 167816 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "kan_Knda-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 163794, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 163794 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "kan_Knda-san_Deva": { + "num_samples": 1503, + "number_of_characters": 161686, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 161686 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "kan_Knda-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 172972, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 172972 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "kan_Knda-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 166216, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - 
"num_samples": 1503, - "number_of_characters": 166216 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "kan_Knda-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 178459, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 178459 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "kan_Knda-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 161283, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 161283 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "kan_Knda-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 164898, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 56.14437791084497, + "max_sentence1_length": 201, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 164898 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "kas_Arab-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 164681, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 164681 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "kas_Arab-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 159085, + "unique_pairs": 1503, + "min_sentence1_length": 5, 
"average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 159085 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "kas_Arab-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 165141, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 165141 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "kas_Arab-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 170129, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 170129 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "kas_Arab-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 163813, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 163813 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "kas_Arab-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 159379, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 159379 + "max_sentence2_length": 203, + 
"unique_sentence2": 1500 }, "kas_Arab-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 161366, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 161366 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "kas_Arab-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 163061, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 163061 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "kas_Arab-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 168274, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 168274 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "kas_Arab-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 165505, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 165505 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "kas_Arab-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 175935, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + 
"min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 175935 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "kas_Arab-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 165844, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 165844 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "kas_Arab-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 160413, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 160413 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "kas_Arab-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 164003, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 164003 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "kas_Arab-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 167320, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 167320 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "kas_Arab-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 163298, 
+ "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 163298 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "kas_Arab-san_Deva": { + "num_samples": 1503, + "number_of_characters": 161190, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 161190 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "kas_Arab-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 172476, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 172476 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "kas_Arab-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 165720, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 165720 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "kas_Arab-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 177963, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - 
"number_of_characters": 177963 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "kas_Arab-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 160787, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 160787 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "kas_Arab-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 164402, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 55.81437125748503, + "max_sentence1_length": 203, + "unique_sentence1": 1502, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 164402 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "mai_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 162408, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 162408 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "mai_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 156812, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 156812 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "mai_Deva-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 162868, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, 
+ "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 162868 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "mai_Deva-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 167856, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 167856 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "mai_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 161540, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 161540 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "mai_Deva-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 157106, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 157106 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "mai_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 159093, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 159093 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "mai_Deva-hin_Deva": { 
+ "num_samples": 1503, + "number_of_characters": 160788, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 160788 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "mai_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 166001, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 166001 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "mai_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 165505, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 165505 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "mai_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 173662, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 173662 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "mai_Deva-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 163571, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 
54.52761144377911, - "num_samples": 1503, - "number_of_characters": 163571 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "mai_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 158140, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 158140 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "mai_Deva-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 161730, + "unique_pairs": 1500, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 161730 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "mai_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 165047, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 165047 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "mai_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 161025, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 161025 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "mai_Deva-san_Deva": { + "num_samples": 1503, + "number_of_characters": 158917, + "unique_pairs": 1502, + "min_sentence1_length": 5, 
"average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 158917 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "mai_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 170203, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 170203 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "mai_Deva-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 163447, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 163447 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "mai_Deva-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 175690, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 175690 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "mai_Deva-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 158514, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 158514 + "max_sentence2_length": 182, + 
"unique_sentence2": 1495 }, "mai_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 162129, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 54.3020625415835, + "max_sentence1_length": 230, + "unique_sentence1": 1499, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 162129 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "mal_Mlym-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 172838, + "unique_pairs": 1498, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 172838 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "mal_Mlym-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 167242, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 167242 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "mal_Mlym-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 173298, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 173298 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "mal_Mlym-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 178286, + "unique_pairs": 1500, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + 
"min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 178286 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "mal_Mlym-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 171970, + "unique_pairs": 1499, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 171970 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "mal_Mlym-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 167536, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 167536 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "mal_Mlym-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 169523, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 169523 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "mal_Mlym-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 171218, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 171218 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "mal_Mlym-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 176431, 
+ "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 176431 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "mal_Mlym-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 175935, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 175935 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "mal_Mlym-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 173662, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 173662 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "mal_Mlym-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 174001, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 174001 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "mal_Mlym-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 168570, + "unique_pairs": 1500, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - 
"number_of_characters": 168570 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "mal_Mlym-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 172160, + "unique_pairs": 1500, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 172160 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "mal_Mlym-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 175477, + "unique_pairs": 1503, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 175477 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "mal_Mlym-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 171455, + "unique_pairs": 1498, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 171455 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "mal_Mlym-san_Deva": { + "num_samples": 1503, + "number_of_characters": 169347, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 169347 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "mal_Mlym-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 180633, + "unique_pairs": 1501, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, 
+ "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 180633 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "mal_Mlym-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 173877, + "unique_pairs": 1499, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 173877 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "mal_Mlym-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 186120, + "unique_pairs": 1502, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 186120 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "mal_Mlym-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 168944, + "unique_pairs": 1500, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 168944 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "mal_Mlym-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 172559, + "unique_pairs": 1499, + "min_sentence1_length": 5, "average_sentence1_length": 61.24151696606786, + "max_sentence1_length": 219, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 172559 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, 
"mar_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 162747, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 162747 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "mar_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 157151, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 157151 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "mar_Deva-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 163207, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 163207 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "mar_Deva-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 168195, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 168195 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "mar_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 161879, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, 
"average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 161879 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "mar_Deva-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 157445, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 157445 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "mar_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 159432, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 159432 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "mar_Deva-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 161127, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 161127 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "mar_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 166340, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 166340 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "mar_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 165844, + "unique_pairs": 1502, + 
"min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 165844 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "mar_Deva-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 163571, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 163571 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "mar_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 174001, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 174001 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "mar_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 158479, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 158479 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "mar_Deva-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 162069, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 162069 + 
"max_sentence2_length": 223, + "unique_sentence2": 1497 }, "mar_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 165386, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 165386 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "mar_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 161364, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 161364 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "mar_Deva-san_Deva": { + "num_samples": 1503, + "number_of_characters": 159256, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 159256 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "mar_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 170542, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 170542 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "mar_Deva-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 163786, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + 
"unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 163786 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "mar_Deva-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 176029, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 176029 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "mar_Deva-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 158853, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 158853 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "mar_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 162468, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.52761144377911, + "max_sentence1_length": 221, + "unique_sentence1": 1501, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 162468 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "mni_Mtei-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 157316, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 157316 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "mni_Mtei-ben_Beng": { + "num_samples": 1503, + 
"number_of_characters": 151720, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 151720 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "mni_Mtei-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 157776, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 157776 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "mni_Mtei-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 162764, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 162764 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "mni_Mtei-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 156448, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 156448 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "mni_Mtei-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 152014, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - 
"num_samples": 1503, - "number_of_characters": 152014 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "mni_Mtei-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 154001, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 154001 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "mni_Mtei-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 155696, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 155696 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "mni_Mtei-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 160909, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 160909 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "mni_Mtei-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 160413, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 160413 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "mni_Mtei-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 158140, + "unique_pairs": 1501, + "min_sentence1_length": 4, 
"average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 158140 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "mni_Mtei-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 168570, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 168570 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "mni_Mtei-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 158479, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 158479 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "mni_Mtei-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 156638, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 156638 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "mni_Mtei-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 159955, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 159955 + "max_sentence2_length": 195, + 
"unique_sentence2": 1500 }, "mni_Mtei-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 155933, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 155933 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "mni_Mtei-san_Deva": { + "num_samples": 1503, + "number_of_characters": 153825, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 153825 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "mni_Mtei-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 165111, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 165111 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "mni_Mtei-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 158355, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 158355 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "mni_Mtei-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 170598, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + 
"min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 170598 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "mni_Mtei-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 153422, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 153422 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "mni_Mtei-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 157037, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 50.91417165668663, + "max_sentence1_length": 239, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 157037 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "npi_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 160906, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 160906 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "npi_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 155310, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 155310 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "npi_Deva-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 
161366, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 161366 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "npi_Deva-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 166354, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 166354 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "npi_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 160038, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 160038 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "npi_Deva-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 155604, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 155604 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "npi_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 157591, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - 
"number_of_characters": 157591 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "npi_Deva-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 159286, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 159286 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "npi_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 164499, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 164499 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "npi_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 164003, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 164003 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "npi_Deva-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 161730, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 161730 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "npi_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 172160, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, 
+ "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 172160 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "npi_Deva-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 162069, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 162069 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "npi_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 156638, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 156638 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "npi_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 163545, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 163545 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "npi_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 159523, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 159523 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, 
"npi_Deva-san_Deva": { + "num_samples": 1503, + "number_of_characters": 157415, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 157415 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "npi_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 168701, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 168701 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "npi_Deva-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 161945, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 161945 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "npi_Deva-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 174188, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 174188 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "npi_Deva-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 157012, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 6, 
"average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 157012 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "npi_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 160627, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.30272787757818, + "max_sentence1_length": 223, + "unique_sentence1": 1497, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 160627 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "ory_Orya-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 164223, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 164223 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "ory_Orya-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 158627, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 158627 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "ory_Orya-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 164683, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 164683 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "ory_Orya-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 169671, + "unique_pairs": 
1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 169671 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "ory_Orya-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 163355, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 163355 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "ory_Orya-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 158921, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 158921 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "ory_Orya-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 160908, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 160908 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "ory_Orya-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 162603, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 
162603 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "ory_Orya-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 167816, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 167816 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "ory_Orya-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 167320, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 167320 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "ory_Orya-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 165047, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 165047 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "ory_Orya-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 175477, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 175477 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "ory_Orya-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 165386, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + 
"max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 165386 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "ory_Orya-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 159955, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 159955 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "ory_Orya-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 163545, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 163545 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "ory_Orya-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 162840, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 162840 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "ory_Orya-san_Deva": { + "num_samples": 1503, + "number_of_characters": 160732, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 160732 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, 
"ory_Orya-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 172018, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 172018 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "ory_Orya-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 165262, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 165262 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "ory_Orya-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 177505, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 177505 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "ory_Orya-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 160329, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 160329 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "ory_Orya-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 163944, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 55.509647371922824, + "max_sentence1_length": 195, + "unique_sentence1": 1500, + "min_sentence2_length": 
4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 163944 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "pan_Guru-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 160201, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 160201 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "pan_Guru-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 154605, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 154605 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "pan_Guru-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 160661, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 160661 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "pan_Guru-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 165649, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 165649 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "pan_Guru-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 159333, + "unique_pairs": 
1499, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 159333 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "pan_Guru-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 154899, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 154899 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "pan_Guru-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 156886, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 156886 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "pan_Guru-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 158581, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 158581 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "pan_Guru-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 163794, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 
163794 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "pan_Guru-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 163298, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 163298 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "pan_Guru-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 161025, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 161025 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "pan_Guru-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 171455, + "unique_pairs": 1498, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 171455 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "pan_Guru-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 161364, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 161364 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "pan_Guru-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 155933, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + 
"max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 155933 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "pan_Guru-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 159523, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 159523 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "pan_Guru-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 162840, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 162840 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "pan_Guru-san_Deva": { + "num_samples": 1503, + "number_of_characters": 156710, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 156710 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "pan_Guru-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 167996, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 167996 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "pan_Guru-snd_Deva": 
{ + "num_samples": 1503, + "number_of_characters": 161240, + "unique_pairs": 1498, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 161240 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "pan_Guru-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 173483, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 173483 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "pan_Guru-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 156307, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 156307 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "pan_Guru-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 159922, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 52.83366600133067, + "max_sentence1_length": 221, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 159922 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "san_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 158093, + "unique_pairs": 1501, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, 
"average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 158093 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "san_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 152497, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 152497 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "san_Deva-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 158553, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 158553 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "san_Deva-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 163541, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 163541 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "san_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 157225, + "unique_pairs": 1501, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 157225 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "san_Deva-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 152791, + "unique_pairs": 1503, + 
"min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 152791 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "san_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 154778, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 154778 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "san_Deva-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 156473, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 156473 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "san_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 161686, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 161686 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "san_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 161190, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 161190 + 
"max_sentence2_length": 203, + "unique_sentence2": 1502 }, "san_Deva-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 158917, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 158917 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "san_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 169347, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 169347 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "san_Deva-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 159256, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 159256 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "san_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 153825, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 153825 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "san_Deva-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 157415, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + 
"unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 157415 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "san_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 160732, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 160732 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "san_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 156710, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 156710 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "san_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 165888, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 165888 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "san_Deva-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 159132, + "unique_pairs": 1501, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 159132 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "san_Deva-tam_Taml": { + "num_samples": 1503, + 
"number_of_characters": 171375, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 171375 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "san_Deva-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 154199, + "unique_pairs": 1501, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 154199 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "san_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 157814, + "unique_pairs": 1501, + "min_sentence1_length": 3, "average_sentence1_length": 51.4311377245509, + "max_sentence1_length": 181, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 157814 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "sat_Olck-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 169379, + "unique_pairs": 1502, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 169379 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "sat_Olck-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 163783, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - 
"num_samples": 1503, - "number_of_characters": 163783 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "sat_Olck-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 169839, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 169839 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "sat_Olck-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 174827, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 174827 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "sat_Olck-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 168511, + "unique_pairs": 1502, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 168511 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "sat_Olck-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 164077, + "unique_pairs": 1502, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 164077 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "sat_Olck-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 166064, + "unique_pairs": 1503, + "min_sentence1_length": 7, 
"average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 166064 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "sat_Olck-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 167759, + "unique_pairs": 1502, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 167759 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "sat_Olck-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 172972, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 172972 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "sat_Olck-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 172476, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 172476 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "sat_Olck-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 170203, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 170203 + "max_sentence2_length": 230, + 
"unique_sentence2": 1499 }, "sat_Olck-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 180633, + "unique_pairs": 1501, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 180633 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "sat_Olck-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 170542, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 170542 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "sat_Olck-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 165111, + "unique_pairs": 1502, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 165111 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "sat_Olck-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 168701, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 168701 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "sat_Olck-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 172018, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + 
"min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 172018 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "sat_Olck-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 167996, + "unique_pairs": 1501, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 167996 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "sat_Olck-san_Deva": { + "num_samples": 1503, + "number_of_characters": 165888, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 165888 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "sat_Olck-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 170418, + "unique_pairs": 1501, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 170418 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "sat_Olck-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 182661, + "unique_pairs": 1503, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 182661 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "sat_Olck-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 
165485, + "unique_pairs": 1502, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 165485 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "sat_Olck-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 169100, + "unique_pairs": 1502, + "min_sentence1_length": 7, "average_sentence1_length": 58.94011976047904, + "max_sentence1_length": 225, + "unique_sentence1": 1500, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 169100 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "snd_Deva-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 162623, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 162623 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "snd_Deva-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 157027, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 157027 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "snd_Deva-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 163083, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, 
- "number_of_characters": 163083 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "snd_Deva-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 168071, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 168071 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "snd_Deva-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 161755, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 161755 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "snd_Deva-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 157321, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 157321 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "snd_Deva-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 159308, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 159308 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "snd_Deva-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 161003, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 
54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 161003 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "snd_Deva-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 166216, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 166216 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "snd_Deva-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 165720, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 165720 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "snd_Deva-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 163447, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 163447 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "snd_Deva-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 173877, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 173877 + "max_sentence2_length": 219, + "unique_sentence2": 1495 
}, "snd_Deva-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 163786, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 163786 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "snd_Deva-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 158355, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 158355 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "snd_Deva-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 161945, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 161945 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "snd_Deva-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 165262, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 165262 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "snd_Deva-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 161240, + "unique_pairs": 1498, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + 
"min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 161240 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "snd_Deva-san_Deva": { + "num_samples": 1503, + "number_of_characters": 159132, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 159132 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "snd_Deva-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 170418, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 170418 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "snd_Deva-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 175905, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 175905 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "snd_Deva-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 158729, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 158729 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "snd_Deva-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 
162344, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 54.445109780439125, + "max_sentence1_length": 195, + "unique_sentence1": 1490, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 162344 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "tam_Taml-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 174866, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 174866 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "tam_Taml-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 169270, + "unique_pairs": 1501, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 169270 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "tam_Taml-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 175326, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 175326 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "tam_Taml-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 180314, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 
1503, - "number_of_characters": 180314 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "tam_Taml-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 173998, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 173998 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "tam_Taml-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 169564, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 169564 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "tam_Taml-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 171551, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 171551 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "tam_Taml-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 173246, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 173246 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "tam_Taml-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 178459, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 
62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 178459 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "tam_Taml-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 177963, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 177963 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "tam_Taml-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 175690, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 175690 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "tam_Taml-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 186120, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 186120 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "tam_Taml-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 176029, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 176029 + "max_sentence2_length": 221, + "unique_sentence2": 1501 
}, "tam_Taml-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 170598, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 170598 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "tam_Taml-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 174188, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 174188 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "tam_Taml-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 177505, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 177505 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "tam_Taml-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 173483, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 173483 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "tam_Taml-san_Deva": { + "num_samples": 1503, + "number_of_characters": 171375, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + 
"min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 171375 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "tam_Taml-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 182661, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 182661 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "tam_Taml-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 175905, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 175905 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "tam_Taml-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 170972, + "unique_pairs": 1502, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 170972 + "max_sentence2_length": 182, + "unique_sentence2": 1495 }, "tam_Taml-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 174587, + "unique_pairs": 1503, + "min_sentence1_length": 3, "average_sentence1_length": 62.590818363273456, + "max_sentence1_length": 224, + "unique_sentence1": 1492, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 174587 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "tel_Telu-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 
157690, + "unique_pairs": 1499, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 157690 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "tel_Telu-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 152094, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 152094 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "tel_Telu-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 158150, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 158150 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "tel_Telu-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 163138, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 163138 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "tel_Telu-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 156822, + "unique_pairs": 1500, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - 
"number_of_characters": 156822 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "tel_Telu-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 152388, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 152388 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "tel_Telu-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 154375, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 154375 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "tel_Telu-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 156070, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 156070 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "tel_Telu-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 161283, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 161283 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "tel_Telu-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 160787, + "unique_pairs": 1503, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, 
+ "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 160787 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "tel_Telu-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 158514, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 158514 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "tel_Telu-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 168944, + "unique_pairs": 1500, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 168944 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "tel_Telu-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 158853, + "unique_pairs": 1503, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 158853 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "tel_Telu-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 153422, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 153422 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "tel_Telu-npi_Deva": 
{ + "num_samples": 1503, + "number_of_characters": 157012, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 157012 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, "tel_Telu-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 160329, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 160329 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "tel_Telu-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 156307, + "unique_pairs": 1499, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 156307 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "tel_Telu-san_Deva": { + "num_samples": 1503, + "number_of_characters": 154199, + "unique_pairs": 1501, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 154199 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "tel_Telu-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 165485, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 7, 
"average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 165485 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "tel_Telu-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 158729, + "unique_pairs": 1499, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 158729 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "tel_Telu-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 170972, + "unique_pairs": 1502, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 170972 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "tel_Telu-urd_Arab": { + "num_samples": 1503, + "number_of_characters": 157411, + "unique_pairs": 1499, + "min_sentence1_length": 6, "average_sentence1_length": 51.16300731869594, + "max_sentence1_length": 182, + "unique_sentence1": 1495, + "min_sentence2_length": 4, "average_sentence2_length": 53.568196939454424, - "num_samples": 1503, - "number_of_characters": 157411 + "max_sentence2_length": 206, + "unique_sentence2": 1498 }, "urd_Arab-asm_Beng": { + "num_samples": 1503, + "number_of_characters": 161305, + "unique_pairs": 1498, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.753825681969396, - "num_samples": 1503, - "number_of_characters": 161305 + "max_sentence2_length": 208, + "unique_sentence2": 1497 }, "urd_Arab-ben_Beng": { + "num_samples": 1503, + "number_of_characters": 155709, + "unique_pairs": 
1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 50.03060545575516, - "num_samples": 1503, - "number_of_characters": 155709 + "max_sentence2_length": 178, + "unique_sentence2": 1497 }, "urd_Arab-brx_Deva": { + "num_samples": 1503, + "number_of_characters": 161765, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 54.05988023952096, - "num_samples": 1503, - "number_of_characters": 161765 + "max_sentence2_length": 210, + "unique_sentence2": 1498 }, "urd_Arab-doi_Deva": { + "num_samples": 1503, + "number_of_characters": 166753, + "unique_pairs": 1500, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 6, "average_sentence2_length": 57.37857618097139, - "num_samples": 1503, - "number_of_characters": 166753 + "max_sentence2_length": 209, + "unique_sentence2": 1499 }, "urd_Arab-eng_Latn": { + "num_samples": 1503, + "number_of_characters": 160437, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.17631403858949, - "num_samples": 1503, - "number_of_characters": 160437 + "max_sentence2_length": 201, + "unique_sentence2": 1497 }, "urd_Arab-gom_Deva": { + "num_samples": 1503, + "number_of_characters": 156003, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 50.22621423819029, - "num_samples": 1503, - "number_of_characters": 
156003 + "max_sentence2_length": 203, + "unique_sentence2": 1500 }, "urd_Arab-guj_Gujr": { + "num_samples": 1503, + "number_of_characters": 157990, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 51.54823685961411, - "num_samples": 1503, - "number_of_characters": 157990 + "max_sentence2_length": 205, + "unique_sentence2": 1500 }, "urd_Arab-hin_Deva": { + "num_samples": 1503, + "number_of_characters": 159685, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 52.67598137059215, - "num_samples": 1503, - "number_of_characters": 159685 + "max_sentence2_length": 192, + "unique_sentence2": 1497 }, "urd_Arab-kan_Knda": { + "num_samples": 1503, + "number_of_characters": 164898, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 56.14437791084497, - "num_samples": 1503, - "number_of_characters": 164898 + "max_sentence2_length": 201, + "unique_sentence2": 1499 }, "urd_Arab-kas_Arab": { + "num_samples": 1503, + "number_of_characters": 164402, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 55.81437125748503, - "num_samples": 1503, - "number_of_characters": 164402 + "max_sentence2_length": 203, + "unique_sentence2": 1502 }, "urd_Arab-mai_Deva": { + "num_samples": 1503, + "number_of_characters": 162129, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + 
"max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 54.3020625415835, - "num_samples": 1503, - "number_of_characters": 162129 + "max_sentence2_length": 230, + "unique_sentence2": 1499 }, "urd_Arab-mal_Mlym": { + "num_samples": 1503, + "number_of_characters": 172559, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 5, "average_sentence2_length": 61.24151696606786, - "num_samples": 1503, - "number_of_characters": 172559 + "max_sentence2_length": 219, + "unique_sentence2": 1495 }, "urd_Arab-mar_Deva": { + "num_samples": 1503, + "number_of_characters": 162468, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 54.52761144377911, - "num_samples": 1503, - "number_of_characters": 162468 + "max_sentence2_length": 221, + "unique_sentence2": 1501 }, "urd_Arab-mni_Mtei": { + "num_samples": 1503, + "number_of_characters": 157037, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 50.91417165668663, - "num_samples": 1503, - "number_of_characters": 157037 + "max_sentence2_length": 239, + "unique_sentence2": 1498 }, "urd_Arab-npi_Deva": { + "num_samples": 1503, + "number_of_characters": 160627, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 53.30272787757818, - "num_samples": 1503, - "number_of_characters": 160627 + "max_sentence2_length": 223, + "unique_sentence2": 1497 }, 
"urd_Arab-ory_Orya": { + "num_samples": 1503, + "number_of_characters": 163944, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 55.509647371922824, - "num_samples": 1503, - "number_of_characters": 163944 + "max_sentence2_length": 195, + "unique_sentence2": 1500 }, "urd_Arab-pan_Guru": { + "num_samples": 1503, + "number_of_characters": 159922, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 4, "average_sentence2_length": 52.83366600133067, - "num_samples": 1503, - "number_of_characters": 159922 + "max_sentence2_length": 221, + "unique_sentence2": 1495 }, "urd_Arab-san_Deva": { + "num_samples": 1503, + "number_of_characters": 157814, + "unique_pairs": 1501, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 3, "average_sentence2_length": 51.4311377245509, - "num_samples": 1503, - "number_of_characters": 157814 + "max_sentence2_length": 181, + "unique_sentence2": 1500 }, "urd_Arab-sat_Olck": { + "num_samples": 1503, + "number_of_characters": 169100, + "unique_pairs": 1502, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 7, "average_sentence2_length": 58.94011976047904, - "num_samples": 1503, - "number_of_characters": 169100 + "max_sentence2_length": 225, + "unique_sentence2": 1500 }, "urd_Arab-snd_Deva": { + "num_samples": 1503, + "number_of_characters": 162344, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 
4, "average_sentence2_length": 54.445109780439125, - "num_samples": 1503, - "number_of_characters": 162344 + "max_sentence2_length": 195, + "unique_sentence2": 1490 }, "urd_Arab-tam_Taml": { + "num_samples": 1503, + "number_of_characters": 174587, + "unique_pairs": 1503, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 3, "average_sentence2_length": 62.590818363273456, - "num_samples": 1503, - "number_of_characters": 174587 + "max_sentence2_length": 224, + "unique_sentence2": 1492 }, "urd_Arab-tel_Telu": { + "num_samples": 1503, + "number_of_characters": 157411, + "unique_pairs": 1499, + "min_sentence1_length": 4, "average_sentence1_length": 53.568196939454424, + "max_sentence1_length": 206, + "unique_sentence1": 1498, + "min_sentence2_length": 6, "average_sentence2_length": 51.16300731869594, - "num_samples": 1503, - "number_of_characters": 157411 + "max_sentence2_length": 182, + "unique_sentence2": 1495 } } } diff --git a/mteb/descriptive_stats/BitextMining/IN22GenBitextMining.json b/mteb/descriptive_stats/BitextMining/IN22GenBitextMining.json new file mode 100644 index 000000000..c53818c9c --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/IN22GenBitextMining.json @@ -0,0 +1,6595 @@ +{ + "test": { + "num_samples": 518144, + "number_of_characters": 162367876, + "unique_pairs": 518101, + "min_sentence1_length": 9, + "average_sentence1_length": 156.6821925951087, + "max_sentence1_length": 692, + "unique_sentence1": 23550, + "min_sentence2_length": 9, + "average_sentence2_length": 156.6821925951087, + "max_sentence2_length": 692, + "unique_sentence2": 23550, + "hf_subset_descriptive_stats": { + "asm_Beng-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 310622, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + 
"min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "asm_Beng-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 323609, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "asm_Beng-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 319020, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "asm_Beng-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 320098, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "asm_Beng-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 312594, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "asm_Beng-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 309440, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 
488, + "unique_sentence2": 1024 + }, + "asm_Beng-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 320106, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "asm_Beng-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 332064, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "asm_Beng-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 322764, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "asm_Beng-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 308682, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "asm_Beng-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 343636, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "asm_Beng-mar_Deva": { + "num_samples": 1024, + 
"number_of_characters": 321784, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "asm_Beng-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 313134, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "asm_Beng-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 313419, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "asm_Beng-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 334226, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "asm_Beng-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 306863, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "asm_Beng-san_Deva": { + "num_samples": 1024, + "number_of_characters": 318079, + "unique_pairs": 1024, + "min_sentence1_length": 13, + 
"average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "asm_Beng-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 326732, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "asm_Beng-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 320421, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "asm_Beng-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 348346, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "asm_Beng-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 319045, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "asm_Beng-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 315134, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 156.6982421875, + "max_sentence1_length": 582, + "unique_sentence1": 1024, + 
"min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "ben_Beng-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 310622, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "ben_Beng-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 313313, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "ben_Beng-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 308724, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "ben_Beng-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 309802, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "ben_Beng-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 302298, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, 
+ "unique_sentence2": 1024 + }, + "ben_Beng-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 299144, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "ben_Beng-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 309810, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "ben_Beng-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 321768, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "ben_Beng-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 312468, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "ben_Beng-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 298386, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "ben_Beng-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 
333340, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "ben_Beng-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 311488, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "ben_Beng-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 302838, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "ben_Beng-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 303123, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "ben_Beng-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 323930, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "ben_Beng-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 296567, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + 
"max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "ben_Beng-san_Deva": { + "num_samples": 1024, + "number_of_characters": 307783, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "ben_Beng-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 316436, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "ben_Beng-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 310125, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "ben_Beng-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 338050, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "ben_Beng-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 308749, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + 
"average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "ben_Beng-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 304838, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 146.6435546875, + "max_sentence1_length": 538, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "brx_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 323609, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "brx_Deva-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 313313, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "brx_Deva-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 321711, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "brx_Deva-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 322789, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, 
+ "brx_Deva-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 315285, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "brx_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 312131, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "brx_Deva-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 322797, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "brx_Deva-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 334755, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "brx_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 325455, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "brx_Deva-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 311373, + "unique_pairs": 1024, 
+ "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "brx_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 346327, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "brx_Deva-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 324475, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "brx_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 315825, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "brx_Deva-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 316110, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "brx_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 336917, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + 
"unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "brx_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 309554, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "brx_Deva-san_Deva": { + "num_samples": 1024, + "number_of_characters": 320770, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "brx_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 329423, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "brx_Deva-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 323112, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "brx_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 351037, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + 
"max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "brx_Deva-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 321736, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "brx_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 317825, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 159.326171875, + "max_sentence1_length": 631, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "doi_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 319020, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "doi_Deva-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 308724, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "doi_Deva-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 321711, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "doi_Deva-eng_Latn": { + "num_samples": 
1024, + "number_of_characters": 318200, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "doi_Deva-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 310696, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "doi_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 307542, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "doi_Deva-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 318208, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "doi_Deva-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 330166, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "doi_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 320866, + "unique_pairs": 1024, + "min_sentence1_length": 14, + 
"average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "doi_Deva-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 306784, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "doi_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 341738, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "doi_Deva-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 319886, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "doi_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 311236, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "doi_Deva-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 311521, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 
1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "doi_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 332328, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "doi_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 304965, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "doi_Deva-san_Deva": { + "num_samples": 1024, + "number_of_characters": 316181, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "doi_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 324834, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "doi_Deva-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 318523, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, 
+ "unique_sentence2": 1024 + }, + "doi_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 346448, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "doi_Deva-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 317147, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "doi_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 313236, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.8447265625, + "max_sentence1_length": 500, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "eng_Latn-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 320098, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "eng_Latn-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 309802, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "eng_Latn-brx_Deva": { + "num_samples": 1024, + 
"number_of_characters": 322789, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "eng_Latn-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 318200, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "eng_Latn-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 311774, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "eng_Latn-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 308620, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "eng_Latn-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 319286, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "eng_Latn-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 331244, + "unique_pairs": 1024, + "min_sentence1_length": 17, + 
"average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "eng_Latn-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 321944, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "eng_Latn-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 307862, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "eng_Latn-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 342816, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "eng_Latn-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 320964, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "eng_Latn-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 312314, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 
1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "eng_Latn-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 312599, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "eng_Latn-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 333406, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "eng_Latn-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 306043, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "eng_Latn-san_Deva": { + "num_samples": 1024, + "number_of_characters": 317259, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "eng_Latn-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 325912, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 
536, + "unique_sentence2": 1024 + }, + "eng_Latn-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 319601, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "eng_Latn-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 347526, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "eng_Latn-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 318225, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "eng_Latn-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 314314, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 155.8974609375, + "max_sentence1_length": 532, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "gom_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 312594, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "gom_Deva-ben_Beng": { + "num_samples": 1024, + 
"number_of_characters": 302298, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "gom_Deva-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 315285, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "gom_Deva-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 310696, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "gom_Deva-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 311774, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "gom_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 301116, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "gom_Deva-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 311782, + "unique_pairs": 1024, + "min_sentence1_length": 17, + 
"average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "gom_Deva-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 323740, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "gom_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 314440, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "gom_Deva-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 300358, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "gom_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 335312, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "gom_Deva-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 313460, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 
1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "gom_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 304810, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "gom_Deva-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 305095, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "gom_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 325902, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "gom_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 298539, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "gom_Deva-san_Deva": { + "num_samples": 1024, + "number_of_characters": 309755, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 
601, + "unique_sentence2": 1024 + }, + "gom_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 318408, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "gom_Deva-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 312097, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "gom_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 340022, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "gom_Deva-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 310721, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "gom_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 306810, + "unique_pairs": 1024, + "min_sentence1_length": 17, + "average_sentence1_length": 148.5693359375, + "max_sentence1_length": 537, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "guj_Gujr-asm_Beng": { + "num_samples": 1024, + 
"number_of_characters": 309440, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "guj_Gujr-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 299144, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "guj_Gujr-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 312131, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "guj_Gujr-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 307542, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "guj_Gujr-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 308620, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "guj_Gujr-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 301116, + "unique_pairs": 1024, + "min_sentence1_length": 14, + 
"average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "guj_Gujr-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 308628, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "guj_Gujr-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 320586, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "guj_Gujr-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 311286, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "guj_Gujr-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 297204, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "guj_Gujr-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 332158, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 
1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "guj_Gujr-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 310306, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "guj_Gujr-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 301656, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "guj_Gujr-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 301941, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "guj_Gujr-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 322748, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "guj_Gujr-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 295385, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 
476, + "unique_sentence2": 1024 + }, + "guj_Gujr-san_Deva": { + "num_samples": 1024, + "number_of_characters": 306601, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "guj_Gujr-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 315254, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "guj_Gujr-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 308943, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "guj_Gujr-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 336868, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "guj_Gujr-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 307567, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "guj_Gujr-urd_Arab": { + "num_samples": 1024, + 
"number_of_characters": 303656, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 145.4892578125, + "max_sentence1_length": 488, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "hin_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 320106, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "hin_Deva-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 309810, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "hin_Deva-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 322797, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "hin_Deva-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 318208, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "hin_Deva-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 319286, + "unique_pairs": 1024, + "min_sentence1_length": 21, + 
"average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "hin_Deva-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 311782, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "hin_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 308628, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "hin_Deva-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 331252, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "hin_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 321952, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "hin_Deva-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 307870, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 
1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "hin_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 342824, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "hin_Deva-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 320972, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "hin_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 312322, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "hin_Deva-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 312607, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "hin_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 333414, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + 
"max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "hin_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 306051, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "hin_Deva-san_Deva": { + "num_samples": 1024, + "number_of_characters": 317267, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "hin_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 325920, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "hin_Deva-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 319609, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "hin_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 347534, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "hin_Deva-tel_Telu": { + "num_samples": 
1024, + "number_of_characters": 318233, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "hin_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 314322, + "unique_pairs": 1024, + "min_sentence1_length": 21, + "average_sentence1_length": 155.9052734375, + "max_sentence1_length": 531, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "kan_Knda-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 332064, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "kan_Knda-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 321768, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "kan_Knda-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 334755, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "kan_Knda-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 330166, + "unique_pairs": 1024, + "min_sentence1_length": 9, + 
"average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "kan_Knda-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 331244, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "kan_Knda-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 323740, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "kan_Knda-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 320586, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "kan_Knda-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 331252, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "kan_Knda-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 333910, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + 
"min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "kan_Knda-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 319828, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "kan_Knda-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 354782, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "kan_Knda-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 332930, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "kan_Knda-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 324280, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "kan_Knda-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 324565, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + 
"unique_sentence2": 1024 + }, + "kan_Knda-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 345372, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "kan_Knda-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 318009, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "kan_Knda-san_Deva": { + "num_samples": 1024, + "number_of_characters": 329225, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "kan_Knda-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 337878, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "kan_Knda-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 331567, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "kan_Knda-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 
359492, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "kan_Knda-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 330191, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "kan_Knda-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 326280, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 167.5830078125, + "max_sentence1_length": 668, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "kas_Arab-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 322764, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "kas_Arab-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 312468, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "kas_Arab-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 325455, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 
158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "kas_Arab-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 320866, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "kas_Arab-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 321944, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "kas_Arab-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 314440, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "kas_Arab-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 311286, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "kas_Arab-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 321952, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 
21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "kas_Arab-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 333910, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "kas_Arab-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 310528, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "kas_Arab-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 345482, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "kas_Arab-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 323630, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "kas_Arab-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 314980, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + 
"unique_sentence2": 1024 + }, + "kas_Arab-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 315265, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "kas_Arab-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 336072, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "kas_Arab-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 308709, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "kas_Arab-san_Deva": { + "num_samples": 1024, + "number_of_characters": 319925, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "kas_Arab-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 328578, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "kas_Arab-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 322267, 
+ "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "kas_Arab-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 350192, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "kas_Arab-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 320891, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "kas_Arab-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 316980, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 158.5009765625, + "max_sentence1_length": 520, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "mai_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 308682, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "mai_Deva-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 298386, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + 
"max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "mai_Deva-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 311373, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "mai_Deva-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 306784, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "mai_Deva-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 307862, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "mai_Deva-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 300358, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "mai_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 297204, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + 
"average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "mai_Deva-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 307870, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "mai_Deva-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 319828, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "mai_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 310528, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "mai_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 331400, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "mai_Deva-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 309548, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 
1024 + }, + "mai_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 300898, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "mai_Deva-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 301183, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "mai_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 321990, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "mai_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 294627, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "mai_Deva-san_Deva": { + "num_samples": 1024, + "number_of_characters": 305843, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "mai_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 314496, + "unique_pairs": 
1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "mai_Deva-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 308185, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "mai_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 336110, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "mai_Deva-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 306809, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "mai_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 302898, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 144.7490234375, + "max_sentence1_length": 562, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "mal_Mlym-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 343636, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + 
"max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "mal_Mlym-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 333340, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "mal_Mlym-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 346327, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "mal_Mlym-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 341738, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "mal_Mlym-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 342816, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "mal_Mlym-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 335312, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + 
"average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "mal_Mlym-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 332158, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "mal_Mlym-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 342824, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "mal_Mlym-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 354782, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "mal_Mlym-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 345482, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "mal_Mlym-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 331400, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 
1024 + }, + "mal_Mlym-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 344502, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "mal_Mlym-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 335852, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "mal_Mlym-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 336137, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "mal_Mlym-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 356944, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "mal_Mlym-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 329581, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "mal_Mlym-san_Deva": { + "num_samples": 1024, + "number_of_characters": 340797, + 
"unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "mal_Mlym-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 349450, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "mal_Mlym-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 343139, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "mal_Mlym-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 371064, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "mal_Mlym-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 341763, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + "max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "mal_Mlym-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 337852, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 178.8837890625, + 
"max_sentence1_length": 692, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "mar_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 321784, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "mar_Deva-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 311488, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "mar_Deva-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 324475, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "mar_Deva-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 319886, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "mar_Deva-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 320964, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + 
"average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "mar_Deva-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 313460, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "mar_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 310306, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "mar_Deva-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 320972, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "mar_Deva-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 332930, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "mar_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 323630, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 
1024 + }, + "mar_Deva-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 309548, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "mar_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 344502, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "mar_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 314000, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "mar_Deva-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 314285, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "mar_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 335092, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "mar_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 307729, + 
"unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "mar_Deva-san_Deva": { + "num_samples": 1024, + "number_of_characters": 318945, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "mar_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 327598, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "mar_Deva-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 321287, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "mar_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 349212, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "mar_Deva-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 319911, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + 
"max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "mar_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 316000, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 157.5439453125, + "max_sentence1_length": 555, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "mni_Mtei-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 313134, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "mni_Mtei-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 302838, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "mni_Mtei-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 315825, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "mni_Mtei-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 311236, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + 
"average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "mni_Mtei-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 312314, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "mni_Mtei-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 304810, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "mni_Mtei-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 301656, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "mni_Mtei-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 312322, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "mni_Mtei-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 324280, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 
1024 + }, + "mni_Mtei-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 314980, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "mni_Mtei-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 300898, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "mni_Mtei-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 335852, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "mni_Mtei-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 314000, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "mni_Mtei-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 305635, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "mni_Mtei-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 326442, + 
"unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "mni_Mtei-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 299079, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "mni_Mtei-san_Deva": { + "num_samples": 1024, + "number_of_characters": 310295, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "mni_Mtei-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 318948, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "mni_Mtei-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 312637, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "mni_Mtei-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 340562, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + 
"max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "mni_Mtei-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 311261, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "mni_Mtei-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 307350, + "unique_pairs": 1024, + "min_sentence1_length": 16, + "average_sentence1_length": 149.0966796875, + "max_sentence1_length": 597, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "npi_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 313419, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "npi_Deva-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 303123, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "npi_Deva-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 316110, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 
159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "npi_Deva-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 311521, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "npi_Deva-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 312599, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "npi_Deva-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 305095, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "npi_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 301941, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "npi_Deva-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 312607, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "npi_Deva-kan_Knda": { + "num_samples": 1024, + 
"number_of_characters": 324565, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "npi_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 315265, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "npi_Deva-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 301183, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "npi_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 336137, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "npi_Deva-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 314285, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "npi_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 305635, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + 
"max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "npi_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 326727, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "npi_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 299364, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "npi_Deva-san_Deva": { + "num_samples": 1024, + "number_of_characters": 310580, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "npi_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 319233, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "npi_Deva-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 312922, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + 
"max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "npi_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 340847, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "npi_Deva-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 311546, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "npi_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 307635, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 149.375, + "max_sentence1_length": 525, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "ory_Orya-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 334226, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "ory_Orya-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 323930, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "ory_Orya-brx_Deva": { + "num_samples": 1024, + 
"number_of_characters": 336917, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "ory_Orya-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 332328, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "ory_Orya-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 333406, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "ory_Orya-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 325902, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "ory_Orya-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 322748, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "ory_Orya-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 333414, + "unique_pairs": 1024, + "min_sentence1_length": 10, + 
"average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "ory_Orya-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 345372, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "ory_Orya-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 336072, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "ory_Orya-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 321990, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "ory_Orya-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 356944, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "ory_Orya-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 335092, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 
1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "ory_Orya-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 326442, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "ory_Orya-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 326727, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "ory_Orya-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 320171, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "ory_Orya-san_Deva": { + "num_samples": 1024, + "number_of_characters": 331387, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "ory_Orya-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 340040, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 
536, + "unique_sentence2": 1024 + }, + "ory_Orya-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 333729, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "ory_Orya-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 361654, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "ory_Orya-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 332353, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "ory_Orya-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 328442, + "unique_pairs": 1024, + "min_sentence1_length": 10, + "average_sentence1_length": 169.6943359375, + "max_sentence1_length": 578, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "pan_Guru-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 306863, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "pan_Guru-ben_Beng": { + "num_samples": 1024, + 
"number_of_characters": 296567, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "pan_Guru-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 309554, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "pan_Guru-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 304965, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "pan_Guru-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 306043, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "pan_Guru-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 298539, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "pan_Guru-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 295385, + "unique_pairs": 1024, + "min_sentence1_length": 19, + 
"average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "pan_Guru-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 306051, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "pan_Guru-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 318009, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "pan_Guru-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 308709, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "pan_Guru-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 294627, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "pan_Guru-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 329581, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + 
"min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "pan_Guru-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 307729, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "pan_Guru-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 299079, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "pan_Guru-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 299364, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "pan_Guru-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 320171, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "pan_Guru-san_Deva": { + "num_samples": 1024, + "number_of_characters": 304024, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + 
"unique_sentence2": 1024 + }, + "pan_Guru-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 312677, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "pan_Guru-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 306366, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "pan_Guru-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 334291, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "pan_Guru-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 304990, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "pan_Guru-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 301079, + "unique_pairs": 1024, + "min_sentence1_length": 19, + "average_sentence1_length": 142.97265625, + "max_sentence1_length": 476, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "san_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 318079, 
+ "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "san_Deva-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 307783, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "san_Deva-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 320770, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "san_Deva-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 316181, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "san_Deva-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 317259, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "san_Deva-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 309755, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + 
"max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "san_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 306601, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "san_Deva-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 317267, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "san_Deva-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 329225, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "san_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 319925, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "san_Deva-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 305843, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + 
"average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "san_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 340797, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "san_Deva-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 318945, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "san_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 310295, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "san_Deva-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 310580, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "san_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 331387, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + 
"san_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 304024, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "san_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 323893, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "san_Deva-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 317582, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "san_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 345507, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "san_Deva-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 316206, + "unique_pairs": 1024, + "min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "san_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 312295, + "unique_pairs": 1024, + 
"min_sentence1_length": 9, + "average_sentence1_length": 153.92578125, + "max_sentence1_length": 601, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "sat_Olck-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 326732, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "sat_Olck-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 316436, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "sat_Olck-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 329423, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "sat_Olck-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 324834, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "sat_Olck-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 325912, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, 
+ "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "sat_Olck-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 318408, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "sat_Olck-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 315254, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "sat_Olck-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 325920, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "sat_Olck-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 337878, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "sat_Olck-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 328578, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 
158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "sat_Olck-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 314496, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "sat_Olck-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 349450, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "sat_Olck-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 327598, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "sat_Olck-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 318948, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "sat_Olck-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 319233, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "sat_Olck-ory_Orya": { + 
"num_samples": 1024, + "number_of_characters": 340040, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "sat_Olck-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 312677, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "sat_Olck-san_Deva": { + "num_samples": 1024, + "number_of_characters": 323893, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "sat_Olck-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 326235, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "sat_Olck-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 354160, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "sat_Olck-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 324859, + "unique_pairs": 1024, + "min_sentence1_length": 
11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "sat_Olck-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 320948, + "unique_pairs": 1024, + "min_sentence1_length": 11, + "average_sentence1_length": 162.3759765625, + "max_sentence1_length": 536, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "snd_Deva-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 320421, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "snd_Deva-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 310125, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "snd_Deva-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 323112, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "snd_Deva-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 318523, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 
1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "snd_Deva-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 319601, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "snd_Deva-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 312097, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "snd_Deva-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 308943, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "snd_Deva-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 319609, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "snd_Deva-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 331567, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + 
"max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "snd_Deva-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 322267, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "snd_Deva-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 308185, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "snd_Deva-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 343139, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "snd_Deva-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 321287, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "snd_Deva-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 312637, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "snd_Deva-npi_Deva": { + "num_samples": 
1024, + "number_of_characters": 312922, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "snd_Deva-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 333729, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "snd_Deva-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 306366, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "snd_Deva-san_Deva": { + "num_samples": 1024, + "number_of_characters": 317582, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "snd_Deva-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 326235, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "snd_Deva-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 347849, + "unique_pairs": 1024, + "min_sentence1_length": 18, + 
"average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "snd_Deva-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 318548, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "snd_Deva-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 314637, + "unique_pairs": 1024, + "min_sentence1_length": 18, + "average_sentence1_length": 156.212890625, + "max_sentence1_length": 545, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "tam_Taml-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 348346, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "tam_Taml-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 338050, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "tam_Taml-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 351037, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + 
"min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "tam_Taml-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 346448, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "tam_Taml-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 347526, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "tam_Taml-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 340022, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "tam_Taml-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 336868, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "tam_Taml-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 347534, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 
531, + "unique_sentence2": 1024 + }, + "tam_Taml-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 359492, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "tam_Taml-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 350192, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "tam_Taml-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 336110, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "tam_Taml-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 371064, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "tam_Taml-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 349212, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "tam_Taml-mni_Mtei": { + "num_samples": 1024, + 
"number_of_characters": 340562, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "tam_Taml-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 340847, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "tam_Taml-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 361654, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "tam_Taml-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 334291, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "tam_Taml-san_Deva": { + "num_samples": 1024, + "number_of_characters": 345507, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "tam_Taml-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 354160, + "unique_pairs": 1024, + "min_sentence1_length": 32, + 
"average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "tam_Taml-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 347849, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "tam_Taml-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 346473, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 14, + "average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + }, + "tam_Taml-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 342562, + "unique_pairs": 1024, + "min_sentence1_length": 32, + "average_sentence1_length": 183.4833984375, + "max_sentence1_length": 614, + "unique_sentence1": 1023, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "tel_Telu-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 319045, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "tel_Telu-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 308749, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, 
+ "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "tel_Telu-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 321736, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "tel_Telu-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 317147, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "tel_Telu-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 318225, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "tel_Telu-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 310721, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 + }, + "tel_Telu-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 307567, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, 
+ "unique_sentence2": 1024 + }, + "tel_Telu-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 318233, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "tel_Telu-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 330191, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "tel_Telu-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 320891, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "tel_Telu-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 306809, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "tel_Telu-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 341763, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "tel_Telu-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 
319911, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "tel_Telu-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 311261, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "tel_Telu-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 311546, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "tel_Telu-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 332353, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "tel_Telu-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 304990, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "tel_Telu-san_Deva": { + "num_samples": 1024, + "number_of_characters": 316206, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + 
"max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "tel_Telu-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 324859, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "tel_Telu-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 318548, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "tel_Telu-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 346473, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "tel_Telu-urd_Arab": { + "num_samples": 1024, + "number_of_characters": 313261, + "unique_pairs": 1024, + "min_sentence1_length": 14, + "average_sentence1_length": 154.869140625, + "max_sentence1_length": 658, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 151.0498046875, + "max_sentence2_length": 574, + "unique_sentence2": 1024 + }, + "urd_Arab-asm_Beng": { + "num_samples": 1024, + "number_of_characters": 315134, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + 
"average_sentence2_length": 156.6982421875, + "max_sentence2_length": 582, + "unique_sentence2": 1024 + }, + "urd_Arab-ben_Beng": { + "num_samples": 1024, + "number_of_characters": 304838, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 146.6435546875, + "max_sentence2_length": 538, + "unique_sentence2": 1024 + }, + "urd_Arab-brx_Deva": { + "num_samples": 1024, + "number_of_characters": 317825, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 159.326171875, + "max_sentence2_length": 631, + "unique_sentence2": 1024 + }, + "urd_Arab-doi_Deva": { + "num_samples": 1024, + "number_of_characters": 313236, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 154.8447265625, + "max_sentence2_length": 500, + "unique_sentence2": 1024 + }, + "urd_Arab-eng_Latn": { + "num_samples": 1024, + "number_of_characters": 314314, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 155.8974609375, + "max_sentence2_length": 532, + "unique_sentence2": 1024 + }, + "urd_Arab-gom_Deva": { + "num_samples": 1024, + "number_of_characters": 306810, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 17, + "average_sentence2_length": 148.5693359375, + "max_sentence2_length": 537, + "unique_sentence2": 1024 
+ }, + "urd_Arab-guj_Gujr": { + "num_samples": 1024, + "number_of_characters": 303656, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 145.4892578125, + "max_sentence2_length": 488, + "unique_sentence2": 1024 + }, + "urd_Arab-hin_Deva": { + "num_samples": 1024, + "number_of_characters": 314322, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 21, + "average_sentence2_length": 155.9052734375, + "max_sentence2_length": 531, + "unique_sentence2": 1024 + }, + "urd_Arab-kan_Knda": { + "num_samples": 1024, + "number_of_characters": 326280, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 167.5830078125, + "max_sentence2_length": 668, + "unique_sentence2": 1024 + }, + "urd_Arab-kas_Arab": { + "num_samples": 1024, + "number_of_characters": 316980, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 158.5009765625, + "max_sentence2_length": 520, + "unique_sentence2": 1024 + }, + "urd_Arab-mai_Deva": { + "num_samples": 1024, + "number_of_characters": 302898, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + "average_sentence2_length": 144.7490234375, + "max_sentence2_length": 562, + "unique_sentence2": 1024 + }, + "urd_Arab-mal_Mlym": { + "num_samples": 1024, + "number_of_characters": 337852, + 
"unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 13, + "average_sentence2_length": 178.8837890625, + "max_sentence2_length": 692, + "unique_sentence2": 1024 + }, + "urd_Arab-mar_Deva": { + "num_samples": 1024, + "number_of_characters": 316000, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 157.5439453125, + "max_sentence2_length": 555, + "unique_sentence2": 1024 + }, + "urd_Arab-mni_Mtei": { + "num_samples": 1024, + "number_of_characters": 307350, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 16, + "average_sentence2_length": 149.0966796875, + "max_sentence2_length": 597, + "unique_sentence2": 1024 + }, + "urd_Arab-npi_Deva": { + "num_samples": 1024, + "number_of_characters": 307635, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 149.375, + "max_sentence2_length": 525, + "unique_sentence2": 1024 + }, + "urd_Arab-ory_Orya": { + "num_samples": 1024, + "number_of_characters": 328442, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 10, + "average_sentence2_length": 169.6943359375, + "max_sentence2_length": 578, + "unique_sentence2": 1024 + }, + "urd_Arab-pan_Guru": { + "num_samples": 1024, + "number_of_characters": 301079, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + 
"max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 19, + "average_sentence2_length": 142.97265625, + "max_sentence2_length": 476, + "unique_sentence2": 1024 + }, + "urd_Arab-san_Deva": { + "num_samples": 1024, + "number_of_characters": 312295, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 9, + "average_sentence2_length": 153.92578125, + "max_sentence2_length": 601, + "unique_sentence2": 1024 + }, + "urd_Arab-sat_Olck": { + "num_samples": 1024, + "number_of_characters": 320948, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 11, + "average_sentence2_length": 162.3759765625, + "max_sentence2_length": 536, + "unique_sentence2": 1024 + }, + "urd_Arab-snd_Deva": { + "num_samples": 1024, + "number_of_characters": 314637, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 18, + "average_sentence2_length": 156.212890625, + "max_sentence2_length": 545, + "unique_sentence2": 1024 + }, + "urd_Arab-tam_Taml": { + "num_samples": 1024, + "number_of_characters": 342562, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 32, + "average_sentence2_length": 183.4833984375, + "max_sentence2_length": 614, + "unique_sentence2": 1023 + }, + "urd_Arab-tel_Telu": { + "num_samples": 1024, + "number_of_characters": 313261, + "unique_pairs": 1024, + "min_sentence1_length": 13, + "average_sentence1_length": 151.0498046875, + "max_sentence1_length": 574, + "unique_sentence1": 1024, + "min_sentence2_length": 14, + 
"average_sentence2_length": 154.869140625, + "max_sentence2_length": 658, + "unique_sentence2": 1024 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/IWSLT2017BitextMining.json b/mteb/descriptive_stats/BitextMining/IWSLT2017BitextMining.json new file mode 100644 index 000000000..504c3f190 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/IWSLT2017BitextMining.json @@ -0,0 +1,329 @@ +{ + "validation": { + "num_samples": 21938, + "number_of_characters": 4256244, + "unique_pairs": 21840, + "min_sentence1_length": 2, + "average_sentence1_length": 97.0061992889051, + "max_sentence1_length": 521, + "unique_sentence1": 11563, + "min_sentence2_length": 2, + "average_sentence2_length": 97.0061992889051, + "max_sentence2_length": 521, + "unique_sentence2": 11563, + "hf_subset_descriptive_stats": { + "ar-en": { + "num_samples": 888, + "number_of_characters": 172499, + "unique_pairs": 887, + "min_sentence1_length": 4, + "average_sentence1_length": 85.48873873873873, + "max_sentence1_length": 369, + "unique_sentence1": 887, + "min_sentence2_length": 10, + "average_sentence2_length": 108.76689189189189, + "max_sentence2_length": 462, + "unique_sentence2": 881 + }, + "de-en": { + "num_samples": 888, + "number_of_characters": 202336, + "unique_pairs": 883, + "min_sentence1_length": 6, + "average_sentence1_length": 119.02702702702703, + "max_sentence1_length": 521, + "unique_sentence1": 881, + "min_sentence2_length": 10, + "average_sentence2_length": 108.82882882882883, + "max_sentence2_length": 462, + "unique_sentence2": 881 + }, + "en-ar": { + "num_samples": 888, + "number_of_characters": 172499, + "unique_pairs": 887, + "min_sentence1_length": 10, + "average_sentence1_length": 108.76689189189189, + "max_sentence1_length": 462, + "unique_sentence1": 881, + "min_sentence2_length": 4, + "average_sentence2_length": 85.48873873873873, + "max_sentence2_length": 369, + "unique_sentence2": 887 + }, + "en-de": { + "num_samples": 888, + 
"number_of_characters": 202336, + "unique_pairs": 883, + "min_sentence1_length": 10, + "average_sentence1_length": 108.82882882882883, + "max_sentence1_length": 462, + "unique_sentence1": 881, + "min_sentence2_length": 6, + "average_sentence2_length": 119.02702702702703, + "max_sentence2_length": 521, + "unique_sentence2": 881 + }, + "en-fr": { + "num_samples": 890, + "number_of_characters": 197619, + "unique_pairs": 883, + "min_sentence1_length": 10, + "average_sentence1_length": 108.4123595505618, + "max_sentence1_length": 462, + "unique_sentence1": 883, + "min_sentence2_length": 6, + "average_sentence2_length": 113.63146067415731, + "max_sentence2_length": 493, + "unique_sentence2": 881 + }, + "en-it": { + "num_samples": 929, + "number_of_characters": 191803, + "unique_pairs": 924, + "min_sentence1_length": 10, + "average_sentence1_length": 103.0010764262648, + "max_sentence1_length": 433, + "unique_sentence1": 922, + "min_sentence2_length": 7, + "average_sentence2_length": 103.46071044133477, + "max_sentence2_length": 444, + "unique_sentence2": 918 + }, + "en-ja": { + "num_samples": 871, + "number_of_characters": 132742, + "unique_pairs": 867, + "min_sentence1_length": 10, + "average_sentence1_length": 109.80826636050517, + "max_sentence1_length": 462, + "unique_sentence1": 864, + "min_sentence2_length": 5, + "average_sentence2_length": 42.59357060849598, + "max_sentence2_length": 225, + "unique_sentence2": 866 + }, + "en-ko": { + "num_samples": 879, + "number_of_characters": 142659, + "unique_pairs": 874, + "min_sentence1_length": 10, + "average_sentence1_length": 107.74175199089875, + "max_sentence1_length": 462, + "unique_sentence1": 872, + "min_sentence2_length": 3, + "average_sentence2_length": 54.5551763367463, + "max_sentence2_length": 250, + "unique_sentence2": 872 + }, + "en-nl": { + "num_samples": 1003, + "number_of_characters": 189637, + "unique_pairs": 1000, + "min_sentence1_length": 10, + "average_sentence1_length": 95.26819541375872, + 
"max_sentence1_length": 433, + "unique_sentence1": 996, + "min_sentence2_length": 4, + "average_sentence2_length": 93.80159521435692, + "max_sentence2_length": 477, + "unique_sentence2": 1000 + }, + "en-ro": { + "num_samples": 914, + "number_of_characters": 194128, + "unique_pairs": 910, + "min_sentence1_length": 10, + "average_sentence1_length": 104.72100656455142, + "max_sentence1_length": 433, + "unique_sentence1": 907, + "min_sentence2_length": 9, + "average_sentence2_length": 107.67286652078775, + "max_sentence2_length": 448, + "unique_sentence2": 910 + }, + "en-zh": { + "num_samples": 879, + "number_of_characters": 131126, + "unique_pairs": 877, + "min_sentence1_length": 10, + "average_sentence1_length": 109.36518771331058, + "max_sentence1_length": 462, + "unique_sentence1": 872, + "min_sentence2_length": 2, + "average_sentence2_length": 39.811149032992034, + "max_sentence2_length": 230, + "unique_sentence2": 867 + }, + "fr-en": { + "num_samples": 890, + "number_of_characters": 197619, + "unique_pairs": 883, + "min_sentence1_length": 6, + "average_sentence1_length": 113.63146067415731, + "max_sentence1_length": 493, + "unique_sentence1": 881, + "min_sentence2_length": 10, + "average_sentence2_length": 108.4123595505618, + "max_sentence2_length": 462, + "unique_sentence2": 883 + }, + "it-en": { + "num_samples": 929, + "number_of_characters": 191803, + "unique_pairs": 924, + "min_sentence1_length": 7, + "average_sentence1_length": 103.46071044133477, + "max_sentence1_length": 444, + "unique_sentence1": 918, + "min_sentence2_length": 10, + "average_sentence2_length": 103.0010764262648, + "max_sentence2_length": 433, + "unique_sentence2": 922 + }, + "it-nl": { + "num_samples": 1001, + "number_of_characters": 188858, + "unique_pairs": 998, + "min_sentence1_length": 7, + "average_sentence1_length": 94.64235764235764, + "max_sentence1_length": 459, + "unique_sentence1": 994, + "min_sentence2_length": 7, + "average_sentence2_length": 94.02697302697302, + 
"max_sentence2_length": 505, + "unique_sentence2": 998 + }, + "it-ro": { + "num_samples": 914, + "number_of_characters": 193339, + "unique_pairs": 911, + "min_sentence1_length": 7, + "average_sentence1_length": 103.90809628008753, + "max_sentence1_length": 435, + "unique_sentence1": 907, + "min_sentence2_length": 9, + "average_sentence2_length": 107.62253829321664, + "max_sentence2_length": 448, + "unique_sentence2": 910 + }, + "ja-en": { + "num_samples": 871, + "number_of_characters": 132742, + "unique_pairs": 867, + "min_sentence1_length": 5, + "average_sentence1_length": 42.59357060849598, + "max_sentence1_length": 225, + "unique_sentence1": 866, + "min_sentence2_length": 10, + "average_sentence2_length": 109.80826636050517, + "max_sentence2_length": 462, + "unique_sentence2": 864 + }, + "ko-en": { + "num_samples": 879, + "number_of_characters": 142659, + "unique_pairs": 874, + "min_sentence1_length": 3, + "average_sentence1_length": 54.5551763367463, + "max_sentence1_length": 250, + "unique_sentence1": 872, + "min_sentence2_length": 10, + "average_sentence2_length": 107.74175199089875, + "max_sentence2_length": 462, + "unique_sentence2": 872 + }, + "nl-en": { + "num_samples": 1003, + "number_of_characters": 189637, + "unique_pairs": 1000, + "min_sentence1_length": 4, + "average_sentence1_length": 93.80159521435692, + "max_sentence1_length": 477, + "unique_sentence1": 1000, + "min_sentence2_length": 10, + "average_sentence2_length": 95.26819541375872, + "max_sentence2_length": 433, + "unique_sentence2": 996 + }, + "nl-it": { + "num_samples": 1001, + "number_of_characters": 188858, + "unique_pairs": 998, + "min_sentence1_length": 7, + "average_sentence1_length": 94.02697302697302, + "max_sentence1_length": 505, + "unique_sentence1": 998, + "min_sentence2_length": 7, + "average_sentence2_length": 94.64235764235764, + "max_sentence2_length": 459, + "unique_sentence2": 994 + }, + "nl-ro": { + "num_samples": 913, + "number_of_characters": 191376, + "unique_pairs": 
911, + "min_sentence1_length": 7, + "average_sentence1_length": 102.01971522453451, + "max_sentence1_length": 478, + "unique_sentence1": 909, + "min_sentence2_length": 9, + "average_sentence2_length": 107.59255202628697, + "max_sentence2_length": 515, + "unique_sentence2": 909 + }, + "ro-en": { + "num_samples": 914, + "number_of_characters": 194128, + "unique_pairs": 910, + "min_sentence1_length": 9, + "average_sentence1_length": 107.67286652078775, + "max_sentence1_length": 448, + "unique_sentence1": 910, + "min_sentence2_length": 10, + "average_sentence2_length": 104.72100656455142, + "max_sentence2_length": 433, + "unique_sentence2": 907 + }, + "ro-it": { + "num_samples": 914, + "number_of_characters": 193339, + "unique_pairs": 911, + "min_sentence1_length": 9, + "average_sentence1_length": 107.62253829321664, + "max_sentence1_length": 448, + "unique_sentence1": 910, + "min_sentence2_length": 7, + "average_sentence2_length": 103.90809628008753, + "max_sentence2_length": 435, + "unique_sentence2": 907 + }, + "ro-nl": { + "num_samples": 913, + "number_of_characters": 191376, + "unique_pairs": 911, + "min_sentence1_length": 9, + "average_sentence1_length": 107.59255202628697, + "max_sentence1_length": 515, + "unique_sentence1": 909, + "min_sentence2_length": 7, + "average_sentence2_length": 102.01971522453451, + "max_sentence2_length": 478, + "unique_sentence2": 909 + }, + "zh-en": { + "num_samples": 879, + "number_of_characters": 131126, + "unique_pairs": 877, + "min_sentence1_length": 2, + "average_sentence1_length": 39.811149032992034, + "max_sentence1_length": 230, + "unique_sentence1": 867, + "min_sentence2_length": 10, + "average_sentence2_length": 109.36518771331058, + "max_sentence2_length": 462, + "unique_sentence2": 872 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/IndicGenBenchFloresBitextMining.json b/mteb/descriptive_stats/BitextMining/IndicGenBenchFloresBitextMining.json new file mode 100644 index 
000000000..1aaed3945 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/IndicGenBenchFloresBitextMining.json @@ -0,0 +1,1540 @@ +{ + "validation": { + "num_samples": 57826, + "number_of_characters": 14600950, + "unique_pairs": 57826, + "min_sentence1_length": 24, + "average_sentence1_length": 126.2541071490333, + "max_sentence1_length": 368, + "unique_sentence1": 29903, + "min_sentence2_length": 24, + "average_sentence2_length": 126.24390412617161, + "max_sentence2_length": 368, + "unique_sentence2": 29903, + "hf_subset_descriptive_stats": { + "ben-eng": { + "num_samples": 997, + "number_of_characters": 248469, + "unique_pairs": 997, + "min_sentence1_length": 30, + "average_sentence1_length": 123.64593781344033, + "max_sentence1_length": 320, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-ben": { + "num_samples": 997, + "number_of_characters": 248469, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 30, + "average_sentence2_length": 123.64593781344033, + "max_sentence2_length": 320, + "unique_sentence2": 997 + }, + "guj-eng": { + "num_samples": 997, + "number_of_characters": 245477, + "unique_pairs": 997, + "min_sentence1_length": 30, + "average_sentence1_length": 120.64493480441324, + "max_sentence1_length": 368, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-guj": { + "num_samples": 997, + "number_of_characters": 245477, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 30, + "average_sentence2_length": 120.64493480441324, + 
"max_sentence2_length": 368, + "unique_sentence2": 997 + }, + "hin-eng": { + "num_samples": 997, + "number_of_characters": 250573, + "unique_pairs": 997, + "min_sentence1_length": 31, + "average_sentence1_length": 125.75626880641926, + "max_sentence1_length": 355, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-hin": { + "num_samples": 997, + "number_of_characters": 250564, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 31, + "average_sentence2_length": 125.74724172517553, + "max_sentence2_length": 355, + "unique_sentence2": 997 + }, + "kan-eng": { + "num_samples": 997, + "number_of_characters": 257131, + "unique_pairs": 997, + "min_sentence1_length": 34, + "average_sentence1_length": 132.33400200601807, + "max_sentence1_length": 331, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-kan": { + "num_samples": 997, + "number_of_characters": 256986, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 34, + "average_sentence2_length": 132.18856569709126, + "max_sentence2_length": 331, + "unique_sentence2": 997 + }, + "mal-eng": { + "num_samples": 997, + "number_of_characters": 267295, + "unique_pairs": 997, + "min_sentence1_length": 31, + "average_sentence1_length": 142.52858575727183, + "max_sentence1_length": 360, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-mal": { + "num_samples": 997, + "number_of_characters": 
267296, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 31, + "average_sentence2_length": 142.5295887662989, + "max_sentence2_length": 360, + "unique_sentence2": 997 + }, + "mar-eng": { + "num_samples": 997, + "number_of_characters": 251107, + "unique_pairs": 997, + "min_sentence1_length": 29, + "average_sentence1_length": 126.29187562688064, + "max_sentence1_length": 321, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-mar": { + "num_samples": 997, + "number_of_characters": 250897, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 29, + "average_sentence2_length": 126.08124373119358, + "max_sentence2_length": 321, + "unique_sentence2": 997 + }, + "tam-eng": { + "num_samples": 997, + "number_of_characters": 271322, + "unique_pairs": 997, + "min_sentence1_length": 30, + "average_sentence1_length": 146.567703109328, + "max_sentence1_length": 358, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-tam": { + "num_samples": 997, + "number_of_characters": 271322, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 30, + "average_sentence2_length": 146.567703109328, + "max_sentence2_length": 358, + "unique_sentence2": 997 + }, + "tel-eng": { + "num_samples": 997, + "number_of_characters": 252385, + "unique_pairs": 997, + "min_sentence1_length": 29, + "average_sentence1_length": 127.57372116349048, + 
"max_sentence1_length": 317, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-tel": { + "num_samples": 997, + "number_of_characters": 252380, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 29, + "average_sentence2_length": 127.56870611835507, + "max_sentence2_length": 317, + "unique_sentence2": 997 + }, + "urd-eng": { + "num_samples": 997, + "number_of_characters": 249824, + "unique_pairs": 997, + "min_sentence1_length": 37, + "average_sentence1_length": 125.00501504513541, + "max_sentence1_length": 295, + "unique_sentence1": 996, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-urd": { + "num_samples": 997, + "number_of_characters": 249824, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 37, + "average_sentence2_length": 125.00501504513541, + "max_sentence2_length": 295, + "unique_sentence2": 996 + }, + "asm-eng": { + "num_samples": 997, + "number_of_characters": 246220, + "unique_pairs": 997, + "min_sentence1_length": 30, + "average_sentence1_length": 121.3901705115346, + "max_sentence1_length": 314, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-asm": { + "num_samples": 997, + "number_of_characters": 246224, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 30, + "average_sentence2_length": 
121.39418254764293, + "max_sentence2_length": 314, + "unique_sentence2": 997 + }, + "bho-eng": { + "num_samples": 997, + "number_of_characters": 246895, + "unique_pairs": 997, + "min_sentence1_length": 25, + "average_sentence1_length": 122.06720160481444, + "max_sentence1_length": 326, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-bho": { + "num_samples": 997, + "number_of_characters": 246919, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 25, + "average_sentence2_length": 122.0912738214644, + "max_sentence2_length": 326, + "unique_sentence2": 997 + }, + "nep-eng": { + "num_samples": 997, + "number_of_characters": 245984, + "unique_pairs": 997, + "min_sentence1_length": 24, + "average_sentence1_length": 121.15346038114343, + "max_sentence1_length": 307, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-nep": { + "num_samples": 997, + "number_of_characters": 245984, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 24, + "average_sentence2_length": 121.15346038114343, + "max_sentence2_length": 307, + "unique_sentence2": 997 + }, + "ory-eng": { + "num_samples": 997, + "number_of_characters": 254206, + "unique_pairs": 997, + "min_sentence1_length": 34, + "average_sentence1_length": 129.4002006018054, + "max_sentence1_length": 308, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-ory": { + "num_samples": 997, + 
"number_of_characters": 254206, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 34, + "average_sentence2_length": 129.4002006018054, + "max_sentence2_length": 308, + "unique_sentence2": 997 + }, + "pan-eng": { + "num_samples": 997, + "number_of_characters": 251598, + "unique_pairs": 997, + "min_sentence1_length": 29, + "average_sentence1_length": 126.78435305917753, + "max_sentence1_length": 309, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-pan": { + "num_samples": 997, + "number_of_characters": 251597, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 29, + "average_sentence2_length": 126.78335005015045, + "max_sentence2_length": 309, + "unique_sentence2": 997 + }, + "pus-eng": { + "num_samples": 997, + "number_of_characters": 247450, + "unique_pairs": 997, + "min_sentence1_length": 32, + "average_sentence1_length": 122.62387161484453, + "max_sentence1_length": 300, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-pus": { + "num_samples": 997, + "number_of_characters": 247450, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 32, + "average_sentence2_length": 122.62387161484453, + "max_sentence2_length": 300, + "unique_sentence2": 997 + }, + "san-eng": { + "num_samples": 997, + "number_of_characters": 249042, + "unique_pairs": 997, + "min_sentence1_length": 31, + "average_sentence1_length": 
124.22066198595788, + "max_sentence1_length": 311, + "unique_sentence1": 994, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-san": { + "num_samples": 997, + "number_of_characters": 248877, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 31, + "average_sentence2_length": 124.05516549648947, + "max_sentence2_length": 311, + "unique_sentence2": 994 + }, + "awa-eng": { + "num_samples": 997, + "number_of_characters": 247944, + "unique_pairs": 997, + "min_sentence1_length": 34, + "average_sentence1_length": 123.11935807422267, + "max_sentence1_length": 329, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-awa": { + "num_samples": 997, + "number_of_characters": 247884, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 34, + "average_sentence2_length": 123.05917753259779, + "max_sentence2_length": 329, + "unique_sentence2": 997 + }, + "bgc-eng": { + "num_samples": 997, + "number_of_characters": 245935, + "unique_pairs": 997, + "min_sentence1_length": 27, + "average_sentence1_length": 121.10431293881645, + "max_sentence1_length": 303, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-bgc": { + "num_samples": 997, + "number_of_characters": 245935, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 27, + 
"average_sentence2_length": 121.10431293881645, + "max_sentence2_length": 303, + "unique_sentence2": 997 + }, + "bod-eng": { + "num_samples": 997, + "number_of_characters": 266515, + "unique_pairs": 997, + "min_sentence1_length": 26, + "average_sentence1_length": 141.74623871614844, + "max_sentence1_length": 355, + "unique_sentence1": 996, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-bod": { + "num_samples": 997, + "number_of_characters": 266495, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 26, + "average_sentence2_length": 141.72617853560683, + "max_sentence2_length": 355, + "unique_sentence2": 996 + }, + "boy-eng": { + "num_samples": 997, + "number_of_characters": 260174, + "unique_pairs": 997, + "min_sentence1_length": 31, + "average_sentence1_length": 135.38615847542627, + "max_sentence1_length": 312, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-boy": { + "num_samples": 997, + "number_of_characters": 260174, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 31, + "average_sentence2_length": 135.38615847542627, + "max_sentence2_length": 312, + "unique_sentence2": 997 + }, + "gbm-eng": { + "num_samples": 997, + "number_of_characters": 247009, + "unique_pairs": 997, + "min_sentence1_length": 30, + "average_sentence1_length": 122.18154463390171, + "max_sentence1_length": 344, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-gbm": { + 
"num_samples": 997, + "number_of_characters": 247009, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 30, + "average_sentence2_length": 122.18154463390171, + "max_sentence2_length": 344, + "unique_sentence2": 997 + }, + "gom-eng": { + "num_samples": 997, + "number_of_characters": 244553, + "unique_pairs": 997, + "min_sentence1_length": 31, + "average_sentence1_length": 119.71815446339016, + "max_sentence1_length": 306, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-gom": { + "num_samples": 997, + "number_of_characters": 244553, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 31, + "average_sentence2_length": 119.71815446339016, + "max_sentence2_length": 306, + "unique_sentence2": 997 + }, + "hne-eng": { + "num_samples": 997, + "number_of_characters": 246416, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 121.58676028084253, + "max_sentence1_length": 321, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-hne": { + "num_samples": 997, + "number_of_characters": 246405, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 121.57572718154464, + "max_sentence2_length": 321, + "unique_sentence2": 997 + }, + "raj-eng": { + "num_samples": 997, + "number_of_characters": 249541, + "unique_pairs": 997, + "min_sentence1_length": 32, + 
"average_sentence1_length": 124.72116349047141, + "max_sentence1_length": 313, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-raj": { + "num_samples": 997, + "number_of_characters": 249541, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 32, + "average_sentence2_length": 124.72116349047141, + "max_sentence2_length": 313, + "unique_sentence2": 997 + }, + "mai-eng": { + "num_samples": 997, + "number_of_characters": 247991, + "unique_pairs": 997, + "min_sentence1_length": 29, + "average_sentence1_length": 123.16649949849548, + "max_sentence1_length": 312, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-mai": { + "num_samples": 997, + "number_of_characters": 247994, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 29, + "average_sentence2_length": 123.16950852557673, + "max_sentence2_length": 312, + "unique_sentence2": 997 + }, + "mni-eng": { + "num_samples": 997, + "number_of_characters": 254308, + "unique_pairs": 997, + "min_sentence1_length": 39, + "average_sentence1_length": 129.5025075225677, + "max_sentence1_length": 310, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-mni": { + "num_samples": 997, + "number_of_characters": 254312, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + 
"min_sentence2_length": 39, + "average_sentence2_length": 129.50651955867602, + "max_sentence2_length": 310, + "unique_sentence2": 997 + }, + "mup-eng": { + "num_samples": 997, + "number_of_characters": 248486, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 123.6629889669007, + "max_sentence1_length": 312, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-mup": { + "num_samples": 997, + "number_of_characters": 248486, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 123.6629889669007, + "max_sentence2_length": 312, + "unique_sentence2": 997 + }, + "mwr-eng": { + "num_samples": 997, + "number_of_characters": 248641, + "unique_pairs": 997, + "min_sentence1_length": 31, + "average_sentence1_length": 123.81845536609829, + "max_sentence1_length": 324, + "unique_sentence1": 997, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 997 + }, + "eng-mwr": { + "num_samples": 997, + "number_of_characters": 248641, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 31, + "average_sentence2_length": 123.81845536609829, + "max_sentence2_length": 324, + "unique_sentence2": 997 + }, + "sat-eng": { + "num_samples": 997, + "number_of_characters": 258279, + "unique_pairs": 997, + "min_sentence1_length": 37, + "average_sentence1_length": 133.4854563691073, + "max_sentence1_length": 333, + "unique_sentence1": 995, + "min_sentence2_length": 28, + "average_sentence2_length": 125.57071213640923, + "max_sentence2_length": 297, + "unique_sentence2": 
997 + }, + "eng-sat": { + "num_samples": 997, + "number_of_characters": 258279, + "unique_pairs": 997, + "min_sentence1_length": 28, + "average_sentence1_length": 125.57071213640923, + "max_sentence1_length": 297, + "unique_sentence1": 997, + "min_sentence2_length": 37, + "average_sentence2_length": 133.4854563691073, + "max_sentence2_length": 333, + "unique_sentence2": 995 + } + } + }, + "test": { + "num_samples": 58696, + "number_of_characters": 15359416, + "unique_pairs": 58690, + "min_sentence1_length": 33, + "average_sentence1_length": 130.84266389532507, + "max_sentence1_length": 431, + "unique_sentence1": 30351, + "min_sentence2_length": 33, + "average_sentence2_length": 130.834724683113, + "max_sentence2_length": 431, + "unique_sentence2": 30351, + "hf_subset_descriptive_stats": { + "ben-eng": { + "num_samples": 1012, + "number_of_characters": 261008, + "unique_pairs": 1012, + "min_sentence1_length": 38, + "average_sentence1_length": 127.51185770750988, + "max_sentence1_length": 333, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-ben": { + "num_samples": 1012, + "number_of_characters": 261008, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 38, + "average_sentence2_length": 127.51185770750988, + "max_sentence2_length": 333, + "unique_sentence2": 1012 + }, + "guj-eng": { + "num_samples": 1012, + "number_of_characters": 258394, + "unique_pairs": 1012, + "min_sentence1_length": 38, + "average_sentence1_length": 124.92885375494072, + "max_sentence1_length": 349, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-guj": { + "num_samples": 1012, + "number_of_characters": 258394, 
+ "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 38, + "average_sentence2_length": 124.92885375494072, + "max_sentence2_length": 349, + "unique_sentence2": 1012 + }, + "hin-eng": { + "num_samples": 1012, + "number_of_characters": 263040, + "unique_pairs": 1012, + "min_sentence1_length": 41, + "average_sentence1_length": 129.5197628458498, + "max_sentence1_length": 381, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-hin": { + "num_samples": 1012, + "number_of_characters": 263029, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 41, + "average_sentence2_length": 129.5088932806324, + "max_sentence2_length": 381, + "unique_sentence2": 1012 + }, + "kan-eng": { + "num_samples": 1012, + "number_of_characters": 270091, + "unique_pairs": 1012, + "min_sentence1_length": 43, + "average_sentence1_length": 136.48715415019763, + "max_sentence1_length": 388, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-kan": { + "num_samples": 1012, + "number_of_characters": 270021, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 43, + "average_sentence2_length": 136.4179841897233, + "max_sentence2_length": 388, + "unique_sentence2": 1012 + }, + "mal-eng": { + "num_samples": 1012, + "number_of_characters": 281302, + "unique_pairs": 1012, + "min_sentence1_length": 48, + "average_sentence1_length": 147.56521739130434, + 
"max_sentence1_length": 376, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-mal": { + "num_samples": 1012, + "number_of_characters": 281302, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 48, + "average_sentence2_length": 147.56521739130434, + "max_sentence2_length": 376, + "unique_sentence2": 1012 + }, + "mar-eng": { + "num_samples": 1012, + "number_of_characters": 265212, + "unique_pairs": 1012, + "min_sentence1_length": 34, + "average_sentence1_length": 131.66600790513834, + "max_sentence1_length": 356, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-mar": { + "num_samples": 1012, + "number_of_characters": 265023, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 34, + "average_sentence2_length": 131.47924901185772, + "max_sentence2_length": 355, + "unique_sentence2": 1012 + }, + "tam-eng": { + "num_samples": 1012, + "number_of_characters": 286099, + "unique_pairs": 1012, + "min_sentence1_length": 48, + "average_sentence1_length": 152.30533596837944, + "max_sentence1_length": 404, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-tam": { + "num_samples": 1012, + "number_of_characters": 286099, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 48, + "average_sentence2_length": 
152.30533596837944, + "max_sentence2_length": 404, + "unique_sentence2": 1012 + }, + "tel-eng": { + "num_samples": 1012, + "number_of_characters": 264460, + "unique_pairs": 1012, + "min_sentence1_length": 39, + "average_sentence1_length": 130.92292490118578, + "max_sentence1_length": 359, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-tel": { + "num_samples": 1012, + "number_of_characters": 264447, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 39, + "average_sentence2_length": 130.9100790513834, + "max_sentence2_length": 359, + "unique_sentence2": 1012 + }, + "urd-eng": { + "num_samples": 1012, + "number_of_characters": 261886, + "unique_pairs": 1012, + "min_sentence1_length": 34, + "average_sentence1_length": 128.37944664031622, + "max_sentence1_length": 348, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-urd": { + "num_samples": 1012, + "number_of_characters": 261885, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 34, + "average_sentence2_length": 128.37845849802372, + "max_sentence2_length": 348, + "unique_sentence2": 1012 + }, + "asm-eng": { + "num_samples": 1012, + "number_of_characters": 257902, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 124.44268774703558, + "max_sentence1_length": 329, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-asm": { + "num_samples": 1012, 
+ "number_of_characters": 257909, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 124.449604743083, + "max_sentence2_length": 329, + "unique_sentence2": 1012 + }, + "bho-eng": { + "num_samples": 1012, + "number_of_characters": 260578, + "unique_pairs": 1012, + "min_sentence1_length": 36, + "average_sentence1_length": 127.08695652173913, + "max_sentence1_length": 367, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-bho": { + "num_samples": 1012, + "number_of_characters": 260601, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 36, + "average_sentence2_length": 127.1096837944664, + "max_sentence2_length": 367, + "unique_sentence2": 1012 + }, + "nep-eng": { + "num_samples": 1012, + "number_of_characters": 258869, + "unique_pairs": 1012, + "min_sentence1_length": 34, + "average_sentence1_length": 125.39822134387352, + "max_sentence1_length": 362, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-nep": { + "num_samples": 1012, + "number_of_characters": 258869, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 34, + "average_sentence2_length": 125.39822134387352, + "max_sentence2_length": 362, + "unique_sentence2": 1012 + }, + "ory-eng": { + "num_samples": 1012, + "number_of_characters": 266805, + "unique_pairs": 1012, + "min_sentence1_length": 38, + "average_sentence1_length": 
133.2401185770751, + "max_sentence1_length": 354, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-ory": { + "num_samples": 1012, + "number_of_characters": 266805, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 38, + "average_sentence2_length": 133.2401185770751, + "max_sentence2_length": 354, + "unique_sentence2": 1012 + }, + "pan-eng": { + "num_samples": 1012, + "number_of_characters": 265391, + "unique_pairs": 1012, + "min_sentence1_length": 37, + "average_sentence1_length": 131.84288537549406, + "max_sentence1_length": 380, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-pan": { + "num_samples": 1012, + "number_of_characters": 265391, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 37, + "average_sentence2_length": 131.84288537549406, + "max_sentence2_length": 380, + "unique_sentence2": 1012 + }, + "pus-eng": { + "num_samples": 1012, + "number_of_characters": 254422, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 121.00395256916995, + "max_sentence1_length": 325, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-pus": { + "num_samples": 1012, + "number_of_characters": 254421, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + 
"average_sentence2_length": 121.00296442687747, + "max_sentence2_length": 325, + "unique_sentence2": 1012 + }, + "san-eng": { + "num_samples": 1012, + "number_of_characters": 260339, + "unique_pairs": 1012, + "min_sentence1_length": 33, + "average_sentence1_length": 126.85079051383399, + "max_sentence1_length": 358, + "unique_sentence1": 1011, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-san": { + "num_samples": 1012, + "number_of_characters": 260224, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 33, + "average_sentence2_length": 126.73715415019763, + "max_sentence2_length": 358, + "unique_sentence2": 1011 + }, + "awa-eng": { + "num_samples": 1012, + "number_of_characters": 260179, + "unique_pairs": 1012, + "min_sentence1_length": 34, + "average_sentence1_length": 126.69268774703558, + "max_sentence1_length": 378, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-awa": { + "num_samples": 1012, + "number_of_characters": 260137, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 34, + "average_sentence2_length": 126.65118577075098, + "max_sentence2_length": 378, + "unique_sentence2": 1012 + }, + "bgc-eng": { + "num_samples": 1012, + "number_of_characters": 257450, + "unique_pairs": 1012, + "min_sentence1_length": 38, + "average_sentence1_length": 123.99604743083005, + "max_sentence1_length": 332, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + 
"eng-bgc": { + "num_samples": 1012, + "number_of_characters": 257450, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 38, + "average_sentence2_length": 123.99604743083005, + "max_sentence2_length": 332, + "unique_sentence2": 1012 + }, + "bod-eng": { + "num_samples": 1012, + "number_of_characters": 280188, + "unique_pairs": 1012, + "min_sentence1_length": 42, + "average_sentence1_length": 146.46442687747034, + "max_sentence1_length": 431, + "unique_sentence1": 1009, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-bod": { + "num_samples": 1012, + "number_of_characters": 280126, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 42, + "average_sentence2_length": 146.40316205533597, + "max_sentence2_length": 431, + "unique_sentence2": 1009 + }, + "boy-eng": { + "num_samples": 1012, + "number_of_characters": 277538, + "unique_pairs": 1012, + "min_sentence1_length": 36, + "average_sentence1_length": 143.84584980237153, + "max_sentence1_length": 396, + "unique_sentence1": 1011, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-boy": { + "num_samples": 1012, + "number_of_characters": 277538, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 36, + "average_sentence2_length": 143.84584980237153, + "max_sentence2_length": 396, + "unique_sentence2": 1011 + }, + "gbm-eng": { + "num_samples": 1012, + "number_of_characters": 261027, + "unique_pairs": 1012, + "min_sentence1_length": 
38, + "average_sentence1_length": 127.53063241106719, + "max_sentence1_length": 333, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-gbm": { + "num_samples": 1012, + "number_of_characters": 261027, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 38, + "average_sentence2_length": 127.53063241106719, + "max_sentence2_length": 333, + "unique_sentence2": 1012 + }, + "gom-eng": { + "num_samples": 1012, + "number_of_characters": 259182, + "unique_pairs": 1012, + "min_sentence1_length": 37, + "average_sentence1_length": 125.70750988142292, + "max_sentence1_length": 335, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-gom": { + "num_samples": 1012, + "number_of_characters": 259182, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 37, + "average_sentence2_length": 125.70750988142292, + "max_sentence2_length": 335, + "unique_sentence2": 1012 + }, + "hne-eng": { + "num_samples": 1012, + "number_of_characters": 258911, + "unique_pairs": 1012, + "min_sentence1_length": 42, + "average_sentence1_length": 125.43972332015811, + "max_sentence1_length": 327, + "unique_sentence1": 1011, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-hne": { + "num_samples": 1012, + "number_of_characters": 258915, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + 
"min_sentence2_length": 42, + "average_sentence2_length": 125.44367588932806, + "max_sentence2_length": 326, + "unique_sentence2": 1011 + }, + "raj-eng": { + "num_samples": 1012, + "number_of_characters": 261987, + "unique_pairs": 1012, + "min_sentence1_length": 38, + "average_sentence1_length": 128.47924901185772, + "max_sentence1_length": 338, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-raj": { + "num_samples": 1012, + "number_of_characters": 261987, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 38, + "average_sentence2_length": 128.47924901185772, + "max_sentence2_length": 338, + "unique_sentence2": 1012 + }, + "mai-eng": { + "num_samples": 1012, + "number_of_characters": 261374, + "unique_pairs": 1012, + "min_sentence1_length": 36, + "average_sentence1_length": 127.87351778656127, + "max_sentence1_length": 350, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-mai": { + "num_samples": 1012, + "number_of_characters": 261377, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 36, + "average_sentence2_length": 127.87648221343873, + "max_sentence2_length": 350, + "unique_sentence2": 1012 + }, + "mni-eng": { + "num_samples": 1012, + "number_of_characters": 268767, + "unique_pairs": 1012, + "min_sentence1_length": 38, + "average_sentence1_length": 135.17885375494072, + "max_sentence1_length": 353, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + 
"unique_sentence2": 1012 + }, + "eng-mni": { + "num_samples": 1012, + "number_of_characters": 268768, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 38, + "average_sentence2_length": 135.1798418972332, + "max_sentence2_length": 354, + "unique_sentence2": 1012 + }, + "mup-eng": { + "num_samples": 1012, + "number_of_characters": 262034, + "unique_pairs": 1012, + "min_sentence1_length": 40, + "average_sentence1_length": 128.52569169960475, + "max_sentence1_length": 340, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-mup": { + "num_samples": 1012, + "number_of_characters": 262034, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 40, + "average_sentence2_length": 128.52569169960475, + "max_sentence2_length": 340, + "unique_sentence2": 1012 + }, + "mwr-eng": { + "num_samples": 1012, + "number_of_characters": 263749, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.22035573122528, + "max_sentence1_length": 345, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-mwr": { + "num_samples": 1012, + "number_of_characters": 263749, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.22035573122528, + "max_sentence2_length": 345, + "unique_sentence2": 1012 + }, + "sat-eng": { + "num_samples": 1012, + "number_of_characters": 271757, + "unique_pairs": 
1012, + "min_sentence1_length": 43, + "average_sentence1_length": 138.13339920948616, + "max_sentence1_length": 366, + "unique_sentence1": 1012, + "min_sentence2_length": 35, + "average_sentence2_length": 130.401185770751, + "max_sentence2_length": 368, + "unique_sentence2": 1012 + }, + "eng-sat": { + "num_samples": 1012, + "number_of_characters": 271757, + "unique_pairs": 1012, + "min_sentence1_length": 35, + "average_sentence1_length": 130.401185770751, + "max_sentence1_length": 368, + "unique_sentence1": 1012, + "min_sentence2_length": 43, + "average_sentence2_length": 138.13339920948616, + "max_sentence2_length": 366, + "unique_sentence2": 1012 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/NTREXBitextMining.json b/mteb/descriptive_stats/BitextMining/NTREXBitextMining.json new file mode 100644 index 000000000..3adf27b3d --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/NTREXBitextMining.json @@ -0,0 +1,24925 @@ +{ + "test": { + "num_samples": 3826252, + "number_of_characters": 988355274, + "unique_pairs": 3820263, + "min_sentence1_length": 1, + "average_sentence1_length": 129.15449296073547, + "max_sentence1_length": 773, + "unique_sentence1": 241259, + "min_sentence2_length": 1, + "average_sentence2_length": 129.15449296073547, + "max_sentence2_length": 773, + "unique_sentence2": 241259, + "hf_subset_descriptive_stats": { + "afr_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 520490, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "afr_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 564002, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 
437, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "afr_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 516072, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "afr_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 526155, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "afr_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 530560, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "afr_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 549109, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "afr_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 560267, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + 
"average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "afr_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 516709, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "afr_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 519796, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "afr_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 520179, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.37756634952427, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "amh_Ethi-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 415227, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "amh_Ethi-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 437473, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 
483, + "unique_sentence2": 1997 + }, + "amh_Ethi-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 413608, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "amh_Ethi-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 459006, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "amh_Ethi-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 404938, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "amh_Ethi-som_Latn": { + "num_samples": 1997, + "number_of_characters": 458799, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "amh_Ethi-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 455649, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "amh_Ethi-swa_Latn": { + 
"num_samples": 1997, + "number_of_characters": 440016, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "amh_Ethi-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 332745, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "amh_Ethi-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 501790, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "amh_Ethi-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 407310, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "amh_Ethi-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 435597, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "amh_Ethi-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 483595, + "unique_pairs": 
1996, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "amh_Ethi-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 425239, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 83.87931897846771, + "max_sentence1_length": 290, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "arb_Arab-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 474983, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "arb_Arab-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 483548, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "arb_Arab-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 526831, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "arb_Arab-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 530308, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 
115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "arb_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 478901, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "arb_Arab-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 474520, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "arb_Arab-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 500981, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "arb_Arab-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 524289, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "arb_Arab-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 431477, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + 
"unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "arb_Arab-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 492756, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "arb_Arab-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 509557, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "arb_Arab-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 518153, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "arb_Arab-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 342807, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "arb_Arab-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 477127, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + 
"average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "arb_Arab-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 364586, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "arb_Arab-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 490578, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "arb_Arab-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 445016, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "arb_Arab-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 523096, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "arb_Arab-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 509047, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 
468, + "unique_sentence2": 1996 + }, + "arb_Arab-por_Latn": { + "num_samples": 1997, + "number_of_characters": 508396, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "arb_Arab-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 473717, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "arb_Arab-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 473814, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "arb_Arab-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 506074, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "arb_Arab-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 446094, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "arb_Arab-spa_Latn": { + 
"num_samples": 1997, + "number_of_characters": 519381, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "arb_Arab-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 503690, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "arb_Arab-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 483008, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "arb_Arab-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 541142, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "arb_Arab-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 505328, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "arb_Arab-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 496794, + 
"unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "arb_Arab-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 502302, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "arb_Arab-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 322659, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "arb_Arab-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 488913, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 115.76414621932899, + "max_sentence1_length": 362, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "aze_Latn-bak_Cyrl": { + "num_samples": 1997, + "number_of_characters": 515960, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 398, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "aze_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 517354, + "unique_pairs": 1997, + "min_sentence1_length": 5, + 
"average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 398, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "aze_Latn-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 529910, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 398, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "aze_Latn-kir_Cyrl": { + "num_samples": 1997, + "number_of_characters": 520498, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 398, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + "max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "aze_Latn-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 515560, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 398, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 123.14772158237356, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "aze_Latn-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 554908, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 398, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "aze_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 535247, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 
398, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "aze_Latn-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 580656, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 398, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + "max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "aze_Latn-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 563329, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 135.0195292939409, + "max_sentence1_length": 398, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "bak_Cyrl-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 515960, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, + "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "bak_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 494046, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "bak_Cyrl-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 506602, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + 
"average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "bak_Cyrl-kir_Cyrl": { + "num_samples": 1997, + "number_of_characters": 497190, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + "max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "bak_Cyrl-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 492252, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 123.14772158237356, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "bak_Cyrl-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 531600, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "bak_Cyrl-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 511939, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "bak_Cyrl-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 557348, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + 
"max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "bak_Cyrl-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 540021, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 123.34802203304957, + "max_sentence1_length": 437, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "bel_Cyrl-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 511000, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "bel_Cyrl-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 525979, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "bel_Cyrl-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 497408, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "bel_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 503810, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + 
"bel_Cyrl-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 512015, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "bel_Cyrl-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 523981, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "bel_Cyrl-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 533956, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "bel_Cyrl-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 530983, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "bel_Cyrl-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 509059, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "bel_Cyrl-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 
508986, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "bel_Cyrl-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 508393, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "bel_Cyrl-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 512231, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "bel_Cyrl-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 518873, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 128.2373560340511, + "max_sentence1_length": 422, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "bem_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 546212, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 149.47020530796195, + "max_sentence1_length": 465, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "bem_Latn-ewe_Latn": { + "num_samples": 1997, + "number_of_characters": 537470, + "unique_pairs": 1997, + "min_sentence1_length": 8, + 
"average_sentence1_length": 149.47020530796195, + "max_sentence1_length": 465, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 119.6685027541312, + "max_sentence2_length": 493, + "unique_sentence2": 1994 + }, + "bem_Latn-fuc_Latn": { + "num_samples": 1997, + "number_of_characters": 526972, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 149.47020530796195, + "max_sentence1_length": 465, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 114.4116174261392, + "max_sentence2_length": 376, + "unique_sentence2": 1996 + }, + "bem_Latn-kin_Latn": { + "num_samples": 1997, + "number_of_characters": 602279, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 149.47020530796195, + "max_sentence1_length": 465, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 152.12168252378567, + "max_sentence2_length": 541, + "unique_sentence2": 1996 + }, + "bem_Latn-nde_Latn": { + "num_samples": 1997, + "number_of_characters": 596231, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 149.47020530796195, + "max_sentence1_length": 465, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 149.09313970956435, + "max_sentence2_length": 590, + "unique_sentence2": 1997 + }, + "bem_Latn-nya_Latn": { + "num_samples": 1997, + "number_of_characters": 582774, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 149.47020530796195, + "max_sentence1_length": 465, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 142.35453179769655, + "max_sentence2_length": 464, + "unique_sentence2": 1993 + }, + "bem_Latn-sna_Latn": { + "num_samples": 1997, + "number_of_characters": 596822, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 149.47020530796195, + 
"max_sentence1_length": 465, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 149.38908362543816, + "max_sentence2_length": 511, + "unique_sentence2": 1995 + }, + "bem_Latn-ven_Latn": { + "num_samples": 1997, + "number_of_characters": 598248, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 149.47020530796195, + "max_sentence1_length": 465, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.10315473209815, + "max_sentence2_length": 535, + "unique_sentence2": 1993 + }, + "ben_Beng-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 474983, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "ben_Beng-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 539452, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "ben_Beng-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 547650, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "ben_Beng-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 542929, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + 
"min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "ben_Beng-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 491522, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ben_Beng-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 519005, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "ben_Beng-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 487141, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "ben_Beng-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 513602, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "ben_Beng-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 536910, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 
146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "ben_Beng-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 488733, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "ben_Beng-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 444098, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "ben_Beng-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 505377, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "ben_Beng-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 522178, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "ben_Beng-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 530774, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 
1997 + }, + "ben_Beng-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 355428, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "ben_Beng-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 509338, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "ben_Beng-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 377207, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "ben_Beng-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 503199, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "ben_Beng-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 504689, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "ben_Beng-nep_Deva": { + "num_samples": 1997, + 
"number_of_characters": 492025, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "ben_Beng-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 535717, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "ben_Beng-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 494224, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "ben_Beng-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 521668, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "ben_Beng-por_Latn": { + "num_samples": 1997, + "number_of_characters": 521017, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "ben_Beng-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 518695, + "unique_pairs": 1997, + 
"min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "ben_Beng-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 502543, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "ben_Beng-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 464129, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "ben_Beng-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 532002, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "ben_Beng-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 516311, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "ben_Beng-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 495629, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 
122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "ben_Beng-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 553763, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "ben_Beng-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 491329, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "ben_Beng-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 509415, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "ben_Beng-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 491800, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "ben_Beng-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 514923, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + 
"unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "ben_Beng-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 335280, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "ben_Beng-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 501534, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 122.08412618928392, + "max_sentence1_length": 402, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "bod_Tibt-dzo_Tibt": { + "num_samples": 1997, + "number_of_characters": 543850, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 150.54031046569855, + "max_sentence1_length": 478, + "unique_sentence1": 1993, + "min_sentence2_length": 5, + "average_sentence2_length": 121.79318978467701, + "max_sentence2_length": 411, + "unique_sentence2": 1992 + }, + "bod_Tibt-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 548349, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 150.54031046569855, + "max_sentence1_length": 478, + "unique_sentence1": 1993, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "bod_Tibt-khm_Khmr": { + "num_samples": 1997, + "number_of_characters": 589120, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 150.54031046569855, + "max_sentence1_length": 478, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + 
"average_sentence2_length": 144.4621932899349, + "max_sentence2_length": 517, + "unique_sentence2": 1996 + }, + "bod_Tibt-lao_Laoo": { + "num_samples": 1997, + "number_of_characters": 567609, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 150.54031046569855, + "max_sentence1_length": 478, + "unique_sentence1": 1993, + "min_sentence2_length": 7, + "average_sentence2_length": 133.69053580370556, + "max_sentence2_length": 507, + "unique_sentence2": 1997 + }, + "bod_Tibt-mon_Mong": { + "num_samples": 1997, + "number_of_characters": 559677, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 150.54031046569855, + "max_sentence1_length": 478, + "unique_sentence1": 1993, + "min_sentence2_length": 11, + "average_sentence2_length": 129.7185778668002, + "max_sentence2_length": 414, + "unique_sentence2": 1997 + }, + "bod_Tibt-mya_Mymr": { + "num_samples": 1997, + "number_of_characters": 612483, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 150.54031046569855, + "max_sentence1_length": 478, + "unique_sentence1": 1993, + "min_sentence2_length": 9, + "average_sentence2_length": 156.16124186279418, + "max_sentence2_length": 773, + "unique_sentence2": 1997 + }, + "bod_Tibt-tha_Thai": { + "num_samples": 1997, + "number_of_characters": 538097, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 150.54031046569855, + "max_sentence1_length": 478, + "unique_sentence1": 1993, + "min_sentence2_length": 7, + "average_sentence2_length": 118.91236855282925, + "max_sentence2_length": 439, + "unique_sentence2": 1996 + }, + "bos_Latn-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 511000, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + 
"max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "bos_Latn-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 524799, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "bos_Latn-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 496228, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "bos_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 502630, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "bos_Latn-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 510835, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "bos_Latn-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 522801, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + 
"bos_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 532776, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "bos_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 529803, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "bos_Latn-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 507879, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "bos_Latn-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 507806, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "bos_Latn-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 507213, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "bos_Latn-srp_Latn": { + "num_samples": 1997, + 
"number_of_characters": 511051, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "bos_Latn-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 517693, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 127.64646970455684, + "max_sentence1_length": 434, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "bul_Cyrl-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 525979, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "bul_Cyrl-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 524799, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "bul_Cyrl-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 511207, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "bul_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 517609, + "unique_pairs": 1997, + 
"min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "bul_Cyrl-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 525814, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "bul_Cyrl-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 537780, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "bul_Cyrl-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 547755, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "bul_Cyrl-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 544782, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "bul_Cyrl-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 522858, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 
135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "bul_Cyrl-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 522785, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "bul_Cyrl-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 522192, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "bul_Cyrl-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 526030, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "bul_Cyrl-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 532672, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 135.14722083124687, + "max_sentence1_length": 493, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "cat_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 530680, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 141.6925388082123, + "max_sentence1_length": 460, + 
"unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "cat_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 576068, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 141.6925388082123, + "max_sentence1_length": 460, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "cat_Latn-glg_Latn": { + "num_samples": 1997, + "number_of_characters": 554946, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 141.6925388082123, + "max_sentence1_length": 460, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 136.19729594391586, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "cat_Latn-ita_Latn": { + "num_samples": 1997, + "number_of_characters": 572177, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 141.6925388082123, + "max_sentence1_length": 460, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 144.82573860791186, + "max_sentence2_length": 623, + "unique_sentence2": 1996 + }, + "cat_Latn-mlt_Latn": { + "num_samples": 1997, + "number_of_characters": 560435, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 141.6925388082123, + "max_sentence1_length": 460, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 138.94591887831749, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "cat_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 560175, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 141.6925388082123, + "max_sentence1_length": 460, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + 
"average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "cat_Latn-ron_Latn": { + "num_samples": 1997, + "number_of_characters": 575445, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 141.6925388082123, + "max_sentence1_length": 460, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 146.4621932899349, + "max_sentence2_length": 518, + "unique_sentence2": 1997 + }, + "cat_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 571160, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 141.6925388082123, + "max_sentence1_length": 460, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "ces_Latn-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 497408, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "ces_Latn-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 496228, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "ces_Latn-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 511207, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 
493, + "unique_sentence2": 1996 + }, + "ces_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 489038, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ces_Latn-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 497243, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "ces_Latn-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 509209, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "ces_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 519184, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "ces_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 516211, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "ces_Latn-slk_Latn": { + 
"num_samples": 1997, + "number_of_characters": 494287, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "ces_Latn-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 494214, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "ces_Latn-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 493621, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "ces_Latn-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 497459, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "ces_Latn-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 504101, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 120.84026039058588, + "max_sentence1_length": 474, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "ckb_Arab-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 483548, + 
"unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "ckb_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 500087, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ckb_Arab-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 495706, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "ckb_Arab-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 452663, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "ckb_Arab-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 498313, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "ckb_Arab-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 466202, + "unique_pairs": 1997, + "min_sentence1_length": 5, + 
"average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "ckb_Arab-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 494903, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "ckb_Arab-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 495000, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "ckb_Arab-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 467280, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "ckb_Arab-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 526514, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 126.37305958938407, + "max_sentence1_length": 399, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "cym_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 514225, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.4526790185278, + 
"max_sentence1_length": 444, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "cym_Latn-gle_Latn": { + "num_samples": 1997, + "number_of_characters": 561314, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.4526790185278, + "max_sentence1_length": 444, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 147.62593890836254, + "max_sentence2_length": 461, + "unique_sentence2": 1997 + }, + "dan_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 520490, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "dan_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 547788, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "dan_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 499858, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "dan_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 509941, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + 
"min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "dan_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 514346, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "dan_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 532895, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "dan_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 544053, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "dan_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 500495, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "dan_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 503582, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 
125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "dan_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 503965, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 126.25838758137206, + "max_sentence1_length": 522, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "deu_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 564002, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "deu_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 526831, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "deu_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 539452, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "deu_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 547788, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + 
"unique_sentence2": 1995 + }, + "deu_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 594777, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "deu_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 543370, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "deu_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 553453, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "deu_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 538989, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "deu_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 565450, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "deu_Latn-fra_Latn": { + 
"num_samples": 1997, + "number_of_characters": 588758, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "deu_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 495946, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "deu_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 557225, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "deu_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 574026, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "deu_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 582622, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "deu_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 557858, + "unique_pairs": 
1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "deu_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 407276, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "deu_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 429055, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "deu_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 555047, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "deu_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 576407, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "deu_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 587565, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 
148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "deu_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 544007, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "deu_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 547094, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "deu_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 573516, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "deu_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 572865, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "deu_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 570543, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + 
"unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "deu_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 583850, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "deu_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 568159, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "deu_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 547477, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "deu_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 605611, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "deu_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 561263, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + 
"average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "deu_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 566771, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "deu_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 387128, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "deu_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 553382, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 148.04707060590886, + "max_sentence1_length": 508, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "div_Thaa-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 547650, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "div_Thaa-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 551568, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 
437, + "unique_sentence2": 1997 + }, + "div_Thaa-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 579051, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "div_Thaa-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 548779, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "div_Thaa-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 565423, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "div_Thaa-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 569384, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "div_Thaa-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 564735, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "div_Thaa-nep_Deva": { + 
"num_samples": 1997, + "number_of_characters": 552071, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "div_Thaa-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 554270, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "div_Thaa-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 562589, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "div_Thaa-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 524175, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "div_Thaa-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 613809, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "div_Thaa-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 551375, + 
"unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "div_Thaa-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 551846, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 152.15222834251378, + "max_sentence1_length": 609, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "dzo_Tibt-bod_Tibt": { + "num_samples": 1997, + "number_of_characters": 543850, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 121.79318978467701, + "max_sentence1_length": 411, + "unique_sentence1": 1992, + "min_sentence2_length": 8, + "average_sentence2_length": 150.54031046569855, + "max_sentence2_length": 478, + "unique_sentence2": 1993 + }, + "dzo_Tibt-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 490941, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 121.79318978467701, + "max_sentence1_length": 411, + "unique_sentence1": 1992, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "dzo_Tibt-khm_Khmr": { + "num_samples": 1997, + "number_of_characters": 531712, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 121.79318978467701, + "max_sentence1_length": 411, + "unique_sentence1": 1992, + "min_sentence2_length": 6, + "average_sentence2_length": 144.4621932899349, + "max_sentence2_length": 517, + "unique_sentence2": 1996 + }, + "dzo_Tibt-lao_Laoo": { + "num_samples": 1997, + "number_of_characters": 510201, + "unique_pairs": 1997, + "min_sentence1_length": 5, + 
"average_sentence1_length": 121.79318978467701, + "max_sentence1_length": 411, + "unique_sentence1": 1992, + "min_sentence2_length": 7, + "average_sentence2_length": 133.69053580370556, + "max_sentence2_length": 507, + "unique_sentence2": 1997 + }, + "dzo_Tibt-mon_Mong": { + "num_samples": 1997, + "number_of_characters": 502269, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 121.79318978467701, + "max_sentence1_length": 411, + "unique_sentence1": 1992, + "min_sentence2_length": 11, + "average_sentence2_length": 129.7185778668002, + "max_sentence2_length": 414, + "unique_sentence2": 1997 + }, + "dzo_Tibt-mya_Mymr": { + "num_samples": 1997, + "number_of_characters": 555075, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 121.79318978467701, + "max_sentence1_length": 411, + "unique_sentence1": 1992, + "min_sentence2_length": 9, + "average_sentence2_length": 156.16124186279418, + "max_sentence2_length": 773, + "unique_sentence2": 1997 + }, + "dzo_Tibt-tha_Thai": { + "num_samples": 1997, + "number_of_characters": 480689, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 121.79318978467701, + "max_sentence1_length": 411, + "unique_sentence1": 1992, + "min_sentence2_length": 7, + "average_sentence2_length": 118.91236855282925, + "max_sentence2_length": 439, + "unique_sentence2": 1996 + }, + "ell_Grek-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 530308, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "ell_Grek-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 542929, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + 
"max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "ell_Grek-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 594777, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "ell_Grek-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 546847, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ell_Grek-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 542466, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "ell_Grek-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 568927, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "ell_Grek-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 592235, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + 
"min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "ell_Grek-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 499423, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "ell_Grek-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 560702, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "ell_Grek-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 577503, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "ell_Grek-hye_Armn": { + "num_samples": 1997, + "number_of_characters": 563842, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 132.55633450175262, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "ell_Grek-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 586099, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 
143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "ell_Grek-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 410753, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "ell_Grek-kat_Geor": { + "num_samples": 1997, + "number_of_characters": 565719, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 133.49624436654983, + "max_sentence2_length": 503, + "unique_sentence2": 1995 + }, + "ell_Grek-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 432532, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "ell_Grek-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 558524, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "ell_Grek-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 591042, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 
1996 + }, + "ell_Grek-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 576993, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "ell_Grek-por_Latn": { + "num_samples": 1997, + "number_of_characters": 576342, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "ell_Grek-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 574020, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "ell_Grek-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 587327, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "ell_Grek-sqi_Latn": { + "num_samples": 1997, + "number_of_characters": 582734, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 142.01652478718077, + "max_sentence2_length": 461, + "unique_sentence2": 1996 + }, + "ell_Grek-swa_Latn": { + "num_samples": 1997, + 
"number_of_characters": 571636, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "ell_Grek-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 550954, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "ell_Grek-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 609088, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "ell_Grek-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 564740, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "ell_Grek-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 570248, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "ell_Grek-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 390605, + "unique_pairs": 1996, + 
"min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "ell_Grek-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 556859, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 149.78818227341011, + "max_sentence1_length": 584, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "eng_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 516072, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "eng_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 415227, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "eng_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 478901, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "eng_Latn-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 517354, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 
124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, + "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "eng_Latn-bak_Cyrl": { + "num_samples": 1997, + "number_of_characters": 494046, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "eng_Latn-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 503810, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "eng_Latn-bem_Latn": { + "num_samples": 1997, + "number_of_characters": 546212, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 149.47020530796195, + "max_sentence2_length": 465, + "unique_sentence2": 1997 + }, + "eng_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 491522, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "eng_Latn-bod_Tibt": { + "num_samples": 1997, + "number_of_characters": 548349, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + 
"unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 150.54031046569855, + "max_sentence2_length": 478, + "unique_sentence2": 1993 + }, + "eng_Latn-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 502630, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "eng_Latn-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 517609, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "eng_Latn-cat_Latn": { + "num_samples": 1997, + "number_of_characters": 530680, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 141.6925388082123, + "max_sentence2_length": 460, + "unique_sentence2": 1997 + }, + "eng_Latn-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 489038, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "eng_Latn-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 500087, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + 
"average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "eng_Latn-cym_Latn": { + "num_samples": 1997, + "number_of_characters": 514225, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.4526790185278, + "max_sentence2_length": 444, + "unique_sentence2": 1997 + }, + "eng_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 499858, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "eng_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 543370, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "eng_Latn-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 551568, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "eng_Latn-dzo_Tibt": { + "num_samples": 1997, + "number_of_characters": 490941, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 121.79318978467701, + 
"max_sentence2_length": 411, + "unique_sentence2": 1992 + }, + "eng_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 546847, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "eng_Latn-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 522923, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "eng_Latn-ewe_Latn": { + "num_samples": 1997, + "number_of_characters": 486698, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 119.6685027541312, + "max_sentence2_length": 493, + "unique_sentence2": 1994 + }, + "eng_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 505523, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "eng_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 491059, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + 
"eng_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 548225, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "eng_Latn-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 541140, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + "eng_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 517520, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "eng_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 540828, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "eng_Latn-fuc_Latn": { + "num_samples": 1997, + "number_of_characters": 476200, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 114.4116174261392, + "max_sentence2_length": 376, + "unique_sentence2": 1996 + }, + "eng_Latn-gle_Latn": { + "num_samples": 1997, + "number_of_characters": 
542529, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 147.62593890836254, + "max_sentence2_length": 461, + "unique_sentence2": 1997 + }, + "eng_Latn-glg_Latn": { + "num_samples": 1997, + "number_of_characters": 519706, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 136.19729594391586, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "eng_Latn-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 492651, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "eng_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 517686, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "eng_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 448016, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "eng_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 509295, + "unique_pairs": 1997, + "min_sentence1_length": 7, + 
"average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "eng_Latn-hmn_Latn": { + "num_samples": 1997, + "number_of_characters": 578510, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 165.6434651977967, + "max_sentence2_length": 643, + "unique_sentence2": 1997 + }, + "eng_Latn-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 503645, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "eng_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 526096, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "eng_Latn-hye_Armn": { + "num_samples": 1997, + "number_of_characters": 512435, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 132.55633450175262, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "eng_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 493821, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 
437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "eng_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 534692, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "eng_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 509928, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "eng_Latn-ita_Latn": { + "num_samples": 1997, + "number_of_characters": 536937, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 144.82573860791186, + "max_sentence2_length": 623, + "unique_sentence2": 1996 + }, + "eng_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 359346, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "eng_Latn-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 513256, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + 
"average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "eng_Latn-kat_Geor": { + "num_samples": 1997, + "number_of_characters": 514312, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 133.49624436654983, + "max_sentence2_length": 503, + "unique_sentence2": 1995 + }, + "eng_Latn-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 507996, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "eng_Latn-khm_Khmr": { + "num_samples": 1997, + "number_of_characters": 536211, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 144.4621932899349, + "max_sentence2_length": 517, + "unique_sentence2": 1996 + }, + "eng_Latn-kin_Latn": { + "num_samples": 1997, + "number_of_characters": 551507, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 152.12168252378567, + "max_sentence2_length": 541, + "unique_sentence2": 1996 + }, + "eng_Latn-kir_Cyrl": { + "num_samples": 1997, + "number_of_characters": 498584, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + 
"max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "eng_Latn-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 493666, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "eng_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 381125, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "eng_Latn-lao_Laoo": { + "num_samples": 1997, + "number_of_characters": 514700, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 133.69053580370556, + "max_sentence2_length": 507, + "unique_sentence2": 1997 + }, + "eng_Latn-lav_Latn": { + "num_samples": 1997, + "number_of_characters": 515908, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 134.29544316474713, + "max_sentence2_length": 503, + "unique_sentence2": 1994 + }, + "eng_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 507117, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + 
"eng_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 528477, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "eng_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 551872, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "eng_Latn-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 508607, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "eng_Latn-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 461555, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "eng_Latn-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 515611, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "eng_Latn-mlg_Latn": { + "num_samples": 1997, + 
"number_of_characters": 568028, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "eng_Latn-mlt_Latn": { + "num_samples": 1997, + "number_of_characters": 525195, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 138.94591887831749, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "eng_Latn-mon_Mong": { + "num_samples": 1997, + "number_of_characters": 506768, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 129.7185778668002, + "max_sentence2_length": 414, + "unique_sentence2": 1997 + }, + "eng_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 521844, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "eng_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 524903, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "eng_Latn-mya_Mymr": { + "num_samples": 1997, + "number_of_characters": 559574, + "unique_pairs": 1997, + 
"min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 156.16124186279418, + "max_sentence2_length": 773, + "unique_sentence2": 1997 + }, + "eng_Latn-nde_Latn": { + "num_samples": 1997, + "number_of_characters": 545459, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 149.09313970956435, + "max_sentence2_length": 590, + "unique_sentence2": 1997 + }, + "eng_Latn-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 495943, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "eng_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 539635, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "eng_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 496077, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "eng_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 499164, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 
124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "eng_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 539219, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "eng_Latn-nya_Latn": { + "num_samples": 1997, + "number_of_characters": 532002, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 142.35453179769655, + "max_sentence2_length": 464, + "unique_sentence2": 1993 + }, + "eng_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 485151, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "eng_Latn-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 498142, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "eng_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 525586, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + 
"unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "eng_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 524935, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "eng_Latn-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 490256, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "eng_Latn-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 490353, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "eng_Latn-ron_Latn": { + "num_samples": 1997, + "number_of_characters": 540205, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 146.4621932899349, + "max_sentence2_length": 518, + "unique_sentence2": 1997 + }, + "eng_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 522613, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + 
"average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "eng_Latn-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 462633, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "eng_Latn-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 506461, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "eng_Latn-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 500689, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "eng_Latn-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 500616, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "eng_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 525575, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + 
"max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "eng_Latn-sna_Latn": { + "num_samples": 1997, + "number_of_characters": 546050, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 149.38908362543816, + "max_sentence2_length": 511, + "unique_sentence2": 1995 + }, + "eng_Latn-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 468047, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "eng_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 539012, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "eng_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 535920, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "eng_Latn-sqi_Latn": { + "num_samples": 1997, + "number_of_characters": 531327, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 142.01652478718077, + "max_sentence2_length": 461, + "unique_sentence2": 1996 + }, + 
"eng_Latn-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 500023, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "eng_Latn-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 503861, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "eng_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 535862, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "eng_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 520229, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "eng_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 499547, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "eng_Latn-tah_Latn": { + "num_samples": 1997, + 
"number_of_characters": 557343, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "eng_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 557681, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "eng_Latn-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 493646, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 123.14772158237356, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "eng_Latn-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 495247, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "eng_Latn-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 521867, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "eng_Latn-tha_Thai": { + "num_samples": 1997, + "number_of_characters": 485188, + "unique_pairs": 1997, + 
"min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 118.91236855282925, + "max_sentence2_length": 439, + "unique_sentence2": 1996 + }, + "eng_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 412958, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "eng_Latn-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 561360, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "eng_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 582003, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "eng_Latn-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 532994, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "eng_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 513333, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 
124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "eng_Latn-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 558742, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + "max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "eng_Latn-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 510503, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "eng_Latn-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 495718, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "eng_Latn-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 541415, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "eng_Latn-ven_Latn": { + "num_samples": 1997, + "number_of_characters": 547476, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + 
"unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.10315473209815, + "max_sentence2_length": 535, + "unique_sentence2": 1993 + }, + "eng_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 518841, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "eng_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 487523, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "eng_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 515810, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "eng_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 563808, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "eng_Latn-yue_Hant": { + "num_samples": 1997, + "number_of_characters": 326607, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + 
"average_sentence2_length": 39.502754131196795, + "max_sentence2_length": 133, + "unique_sentence2": 1996 + }, + "eng_Latn-zho_Hans": { + "num_samples": 1997, + "number_of_characters": 332681, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 42.54431647471207, + "max_sentence2_length": 263, + "unique_sentence2": 1997 + }, + "eng_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 339198, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "eng_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 505452, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.04606910365548, + "max_sentence1_length": 437, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "eus_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 519005, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "eus_Latn-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 579051, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + 
"max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "eus_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 522923, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "eus_Latn-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 520134, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "eus_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 536778, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "eus_Latn-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 540739, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "eus_Latn-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 536090, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + 
"eus_Latn-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 523426, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "eus_Latn-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 525625, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "eus_Latn-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 533944, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "eus_Latn-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 495530, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "eus_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 585164, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "eus_Latn-tel_Telu": { + "num_samples": 1997, + 
"number_of_characters": 522730, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "eus_Latn-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 523201, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 137.80821231847773, + "max_sentence1_length": 393, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "ewe_Latn-bem_Latn": { + "num_samples": 1997, + "number_of_characters": 537470, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 119.6685027541312, + "max_sentence1_length": 493, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 149.47020530796195, + "max_sentence2_length": 465, + "unique_sentence2": 1997 + }, + "ewe_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 486698, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 119.6685027541312, + "max_sentence1_length": 493, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ewe_Latn-fuc_Latn": { + "num_samples": 1997, + "number_of_characters": 467458, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 119.6685027541312, + "max_sentence1_length": 493, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 114.4116174261392, + "max_sentence2_length": 376, + "unique_sentence2": 1996 + }, + "ewe_Latn-kin_Latn": { + "num_samples": 1997, + "number_of_characters": 542765, + "unique_pairs": 1997, + 
"min_sentence1_length": 6, + "average_sentence1_length": 119.6685027541312, + "max_sentence1_length": 493, + "unique_sentence1": 1994, + "min_sentence2_length": 10, + "average_sentence2_length": 152.12168252378567, + "max_sentence2_length": 541, + "unique_sentence2": 1996 + }, + "ewe_Latn-nde_Latn": { + "num_samples": 1997, + "number_of_characters": 536717, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 119.6685027541312, + "max_sentence1_length": 493, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 149.09313970956435, + "max_sentence2_length": 590, + "unique_sentence2": 1997 + }, + "ewe_Latn-nya_Latn": { + "num_samples": 1997, + "number_of_characters": 523260, + "unique_pairs": 1995, + "min_sentence1_length": 6, + "average_sentence1_length": 119.6685027541312, + "max_sentence1_length": 493, + "unique_sentence1": 1994, + "min_sentence2_length": 10, + "average_sentence2_length": 142.35453179769655, + "max_sentence2_length": 464, + "unique_sentence2": 1993 + }, + "ewe_Latn-sna_Latn": { + "num_samples": 1997, + "number_of_characters": 537308, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 119.6685027541312, + "max_sentence1_length": 493, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 149.38908362543816, + "max_sentence2_length": 511, + "unique_sentence2": 1995 + }, + "ewe_Latn-ven_Latn": { + "num_samples": 1997, + "number_of_characters": 538734, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 119.6685027541312, + "max_sentence1_length": 493, + "unique_sentence1": 1994, + "min_sentence2_length": 10, + "average_sentence2_length": 150.10315473209815, + "max_sentence2_length": 535, + "unique_sentence2": 1993 + }, + "fao_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 526155, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 
129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "fao_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 509941, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "fao_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 553453, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "fao_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 505523, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "fao_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 520011, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "fao_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 538560, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 
1997, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "fao_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 549718, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "fao_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 506160, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "fao_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 509247, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "fao_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 509630, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0951427140711, + "max_sentence1_length": 433, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "fas_Arab-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 474520, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 
115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "fas_Arab-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 487141, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "fas_Arab-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 495706, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "fas_Arab-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 538989, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "fas_Arab-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 542466, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "fas_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 491059, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + 
"unique_sentence2": 1997 + }, + "fas_Arab-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 513139, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "fas_Arab-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 536447, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "fas_Arab-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 443635, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "fas_Arab-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 504914, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "fas_Arab-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 521715, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "fas_Arab-ind_Latn": { + "num_samples": 
1997, + "number_of_characters": 530311, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "fas_Arab-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 354965, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "fas_Arab-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 489285, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "fas_Arab-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 376744, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "fas_Arab-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 502736, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "fas_Arab-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 457174, + "unique_pairs": 1996, + 
"min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "fas_Arab-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 535254, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "fas_Arab-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 521205, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "fas_Arab-por_Latn": { + "num_samples": 1997, + "number_of_characters": 520554, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "fas_Arab-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 485875, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "fas_Arab-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 485972, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 
121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "fas_Arab-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 518232, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "fas_Arab-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 458252, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "fas_Arab-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 531539, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "fas_Arab-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 515848, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "fas_Arab-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 495166, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + 
"unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "fas_Arab-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 553300, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "fas_Arab-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 517486, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "fas_Arab-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 508952, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "fas_Arab-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 514460, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "fas_Arab-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 334817, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + 
"average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "fas_Arab-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 501071, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 121.85227841762644, + "max_sentence1_length": 389, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "fij_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 548225, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "fij_Latn-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 593925, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + "fij_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 587477, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "fij_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 604657, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 
540, + "unique_sentence2": 1996 + }, + "fij_Latn-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 620813, + "unique_pairs": 1995, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "fij_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 574629, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "fij_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 577688, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "fij_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 578360, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "fij_Latn-tah_Latn": { + "num_samples": 1997, + "number_of_characters": 610128, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "fij_Latn-ton_Latn": { + 
"num_samples": 1997, + "number_of_characters": 614145, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.478217325989, + "max_sentence1_length": 448, + "unique_sentence1": 1988, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "fil_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 541140, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "fil_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 593925, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "fil_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 580392, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "fil_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 597572, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "fil_Latn-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 613728, + 
"unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "fil_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 567544, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "fil_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 570603, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "fil_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 571275, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "fil_Latn-tah_Latn": { + "num_samples": 1997, + "number_of_characters": 603043, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "fil_Latn-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 607060, + "unique_pairs": 1997, + "min_sentence1_length": 10, + 
"average_sentence1_length": 146.93039559339007, + "max_sentence1_length": 554, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "fin_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 500981, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "fin_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 513602, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "fin_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 565450, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "fin_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 568927, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "fin_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 517520, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + 
"max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "fin_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 513139, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "fin_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 562908, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "fin_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 470096, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "fin_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 531375, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "fin_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 548176, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + 
"min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "fin_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 556772, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "fin_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 381426, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "fin_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 403205, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "fin_Latn-lav_Latn": { + "num_samples": 1997, + "number_of_characters": 537988, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 134.29544316474713, + "max_sentence2_length": 503, + "unique_sentence2": 1994 + }, + "fin_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 529197, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 
129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "fin_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 561715, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "fin_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 547666, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "fin_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 547015, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "fin_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 544693, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "fin_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 558000, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 
1996 + }, + "fin_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 542309, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "fin_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 521627, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "fin_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 579761, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "fin_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 535413, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "fin_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 540921, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "fin_Latn-zho_Hant": { + "num_samples": 1997, + 
"number_of_characters": 361278, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "fin_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 527532, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.10265398097147, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "fra_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 524289, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "fra_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 536910, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "fra_Latn-cat_Latn": { + "num_samples": 1997, + "number_of_characters": 576068, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 141.6925388082123, + "max_sentence2_length": 460, + "unique_sentence2": 1997 + }, + "fra_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 588758, + "unique_pairs": 1996, + 
"min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "fra_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 592235, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "fra_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 540828, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "fra_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 536447, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "fra_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 562908, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "fra_Latn-glg_Latn": { + "num_samples": 1997, + "number_of_characters": 565094, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 
146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 136.19729594391586, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "fra_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 493404, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "fra_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 554683, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "fra_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 571484, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "fra_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 580080, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "fra_Latn-ita_Latn": { + "num_samples": 1997, + "number_of_characters": 582325, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 
1996, + "min_sentence2_length": 7, + "average_sentence2_length": 144.82573860791186, + "max_sentence2_length": 623, + "unique_sentence2": 1996 + }, + "fra_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 404734, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "fra_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 426513, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "fra_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 552505, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "fra_Latn-mlt_Latn": { + "num_samples": 1997, + "number_of_characters": 570583, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 138.94591887831749, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "fra_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 585023, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 
146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "fra_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 570974, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "fra_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 570323, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "fra_Latn-ron_Latn": { + "num_samples": 1997, + "number_of_characters": 585593, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 146.4621932899349, + "max_sentence2_length": 518, + "unique_sentence2": 1997 + }, + "fra_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 568001, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "fra_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 581308, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 
+ }, + "fra_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 565617, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "fra_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 544935, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "fra_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 603069, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "fra_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 558721, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "fra_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 564229, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "fra_Latn-zho_Hant": { + "num_samples": 1997, + 
"number_of_characters": 384586, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "fra_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 550840, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.7741612418628, + "max_sentence1_length": 512, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "fuc_Latn-bem_Latn": { + "num_samples": 1997, + "number_of_characters": 526972, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 114.4116174261392, + "max_sentence1_length": 376, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 149.47020530796195, + "max_sentence2_length": 465, + "unique_sentence2": 1997 + }, + "fuc_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 476200, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 114.4116174261392, + "max_sentence1_length": 376, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "fuc_Latn-ewe_Latn": { + "num_samples": 1997, + "number_of_characters": 467458, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 114.4116174261392, + "max_sentence1_length": 376, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 119.6685027541312, + "max_sentence2_length": 493, + "unique_sentence2": 1994 + }, + "fuc_Latn-kin_Latn": { + "num_samples": 1997, + "number_of_characters": 532267, + "unique_pairs": 1997, + 
"min_sentence1_length": 6, + "average_sentence1_length": 114.4116174261392, + "max_sentence1_length": 376, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 152.12168252378567, + "max_sentence2_length": 541, + "unique_sentence2": 1996 + }, + "fuc_Latn-nde_Latn": { + "num_samples": 1997, + "number_of_characters": 526219, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 114.4116174261392, + "max_sentence1_length": 376, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 149.09313970956435, + "max_sentence2_length": 590, + "unique_sentence2": 1997 + }, + "fuc_Latn-nya_Latn": { + "num_samples": 1997, + "number_of_characters": 512762, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 114.4116174261392, + "max_sentence1_length": 376, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 142.35453179769655, + "max_sentence2_length": 464, + "unique_sentence2": 1993 + }, + "fuc_Latn-sna_Latn": { + "num_samples": 1997, + "number_of_characters": 526810, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 114.4116174261392, + "max_sentence1_length": 376, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 149.38908362543816, + "max_sentence2_length": 511, + "unique_sentence2": 1995 + }, + "fuc_Latn-ven_Latn": { + "num_samples": 1997, + "number_of_characters": 528236, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 114.4116174261392, + "max_sentence1_length": 376, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 150.10315473209815, + "max_sentence2_length": 535, + "unique_sentence2": 1993 + }, + "gle_Latn-cym_Latn": { + "num_samples": 1997, + "number_of_characters": 561314, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 
147.62593890836254, + "max_sentence1_length": 461, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.4526790185278, + "max_sentence2_length": 444, + "unique_sentence2": 1997 + }, + "gle_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 542529, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 147.62593890836254, + "max_sentence1_length": 461, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "glg_Latn-cat_Latn": { + "num_samples": 1997, + "number_of_characters": 554946, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 136.19729594391586, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 141.6925388082123, + "max_sentence2_length": 460, + "unique_sentence2": 1997 + }, + "glg_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 519706, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 136.19729594391586, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "glg_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 565094, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 136.19729594391586, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "glg_Latn-ita_Latn": { + "num_samples": 1997, + "number_of_characters": 561203, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 136.19729594391586, + "max_sentence1_length": 468, + 
"unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 144.82573860791186, + "max_sentence2_length": 623, + "unique_sentence2": 1996 + }, + "glg_Latn-mlt_Latn": { + "num_samples": 1997, + "number_of_characters": 549461, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 136.19729594391586, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 138.94591887831749, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "glg_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 549201, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 136.19729594391586, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "glg_Latn-ron_Latn": { + "num_samples": 1997, + "number_of_characters": 564471, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 136.19729594391586, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 146.4621932899349, + "max_sentence2_length": 518, + "unique_sentence2": 1997 + }, + "glg_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 560186, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 136.19729594391586, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "guj_Gujr-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 488733, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + 
"average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "guj_Gujr-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 548779, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "guj_Gujr-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 492651, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "guj_Gujr-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 520134, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "guj_Gujr-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 506506, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "guj_Gujr-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 510467, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + 
"max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "guj_Gujr-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 505818, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "guj_Gujr-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 493154, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "guj_Gujr-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 495353, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "guj_Gujr-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 503672, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "guj_Gujr-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 465258, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + 
"guj_Gujr-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 554892, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "guj_Gujr-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 492458, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "guj_Gujr-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 492929, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 122.64947421131697, + "max_sentence1_length": 378, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "hau_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 437473, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "hau_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 517686, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "hau_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 
516067, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "hau_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 561465, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "hau_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 507397, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "hau_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 561258, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "hau_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 558108, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "hau_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 542475, + "unique_pairs": 1997, + "min_sentence1_length": 8, + 
"average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "hau_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 435204, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "hau_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 604249, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "hau_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 509769, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "hau_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 538056, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "hau_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 586054, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + 
"unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "hau_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 527698, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 135.185778668002, + "max_sentence1_length": 483, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "heb_Hebr-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 431477, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "heb_Hebr-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 444098, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "heb_Hebr-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 452663, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "heb_Hebr-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 495946, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + 
"average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "heb_Hebr-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 499423, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "heb_Hebr-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 448016, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "heb_Hebr-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 443635, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "heb_Hebr-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 470096, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "heb_Hebr-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 493404, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + 
"max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "heb_Hebr-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 461871, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "heb_Hebr-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 478672, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "heb_Hebr-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 487268, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "heb_Hebr-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 311922, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "heb_Hebr-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 446242, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + 
"heb_Hebr-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 333701, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "heb_Hebr-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 459693, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "heb_Hebr-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 414131, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "heb_Hebr-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 492211, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "heb_Hebr-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 478162, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "heb_Hebr-por_Latn": { + "num_samples": 1997, + "number_of_characters": 
477511, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "heb_Hebr-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 442832, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "heb_Hebr-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 442929, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "heb_Hebr-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 475189, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "heb_Hebr-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 415209, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "heb_Hebr-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 488496, + "unique_pairs": 1996, + "min_sentence1_length": 7, + 
"average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "heb_Hebr-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 472805, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "heb_Hebr-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 452123, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "heb_Hebr-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 510257, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "heb_Hebr-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 474443, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "heb_Hebr-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 465909, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + 
"max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "heb_Hebr-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 471417, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "heb_Hebr-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 291774, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "heb_Hebr-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 458028, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 100.29844767150726, + "max_sentence1_length": 375, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "hin_Deva-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 492756, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "hin_Deva-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 505377, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + 
"min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "hin_Deva-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 557225, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "hin_Deva-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 565423, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "hin_Deva-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 560702, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "hin_Deva-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 509295, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "hin_Deva-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 536778, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 
137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "hin_Deva-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 504914, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "hin_Deva-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 531375, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "hin_Deva-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 554683, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "hin_Deva-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 506506, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "hin_Deva-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 461871, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 
1996 + }, + "hin_Deva-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 539951, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "hin_Deva-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 548547, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "hin_Deva-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 373201, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "hin_Deva-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 527111, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "hin_Deva-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 394980, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "hin_Deva-lit_Latn": { + "num_samples": 1997, + 
"number_of_characters": 520972, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "hin_Deva-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 522462, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "hin_Deva-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 509798, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "hin_Deva-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 553490, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "hin_Deva-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 511997, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "hin_Deva-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 539441, + "unique_pairs": 1996, + 
"min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "hin_Deva-por_Latn": { + "num_samples": 1997, + "number_of_characters": 538790, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "hin_Deva-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 536468, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "hin_Deva-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 520316, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "hin_Deva-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 481902, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "hin_Deva-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 549775, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, 
+ "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "hin_Deva-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 534084, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "hin_Deva-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 513402, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "hin_Deva-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 571536, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "hin_Deva-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 509102, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "hin_Deva-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 527188, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + 
"min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "hin_Deva-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 509573, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "hin_Deva-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 532696, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "hin_Deva-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 353053, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "hin_Deva-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 519307, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 130.9839759639459, + "max_sentence1_length": 394, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "hmn_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 578510, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 165.6434651977967, + "max_sentence1_length": 643, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + 
"max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "hrv_Latn-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 512015, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "hrv_Latn-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 510835, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "hrv_Latn-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 525814, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "hrv_Latn-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 497243, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "hrv_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 503645, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + 
"hrv_Latn-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 523816, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "hrv_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 533791, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "hrv_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 530818, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "hrv_Latn-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 508894, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "hrv_Latn-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 508821, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "hrv_Latn-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 
508228, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "hrv_Latn-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 512066, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "hrv_Latn-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 518708, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 128.1547320981472, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "hun_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 509557, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "hun_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 522178, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "hun_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 574026, + "unique_pairs": 1997, + "min_sentence1_length": 7, + 
"average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "hun_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 577503, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "hun_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 526096, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "hun_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 521715, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "hun_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 548176, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "hun_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 571484, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 
508, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "hun_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 478672, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "hun_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 539951, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "hun_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 565348, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "hun_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 390002, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "hun_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 411781, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + 
"average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "hun_Latn-lav_Latn": { + "num_samples": 1997, + "number_of_characters": 546564, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 134.29544316474713, + "max_sentence2_length": 503, + "unique_sentence2": 1994 + }, + "hun_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 537773, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "hun_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 570291, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "hun_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 556242, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "hun_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 555591, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, 
+ "unique_sentence2": 1996 + }, + "hun_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 553269, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "hun_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 566576, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "hun_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 550885, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "hun_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 530203, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "hun_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 588337, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "hun_Latn-tur_Latn": { + "num_samples": 
1997, + "number_of_characters": 543989, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "hun_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 549497, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "hun_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 369854, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "hun_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 536108, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 139.3970956434652, + "max_sentence1_length": 508, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "hye_Armn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 563842, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 132.55633450175262, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "hye_Armn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 512435, + "unique_pairs": 1997, + 
"min_sentence1_length": 7, + "average_sentence1_length": 132.55633450175262, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "hye_Armn-kat_Geor": { + "num_samples": 1997, + "number_of_characters": 531307, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 132.55633450175262, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 133.49624436654983, + "max_sentence2_length": 503, + "unique_sentence2": 1995 + }, + "hye_Armn-sqi_Latn": { + "num_samples": 1997, + "number_of_characters": 548322, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 132.55633450175262, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 142.01652478718077, + "max_sentence2_length": 461, + "unique_sentence2": 1996 + }, + "ibo_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 413608, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "ibo_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 493821, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ibo_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 516067, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 
123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "ibo_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 537600, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "ibo_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 483532, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "ibo_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 537393, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "ibo_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 534243, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "ibo_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 518610, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 
1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "ibo_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 411339, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "ibo_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 580384, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "ibo_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 485904, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "ibo_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 514191, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "ibo_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 562189, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 
158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "ibo_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 503833, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 123.23535302954431, + "max_sentence1_length": 469, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "ind_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 518153, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "ind_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 530774, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "ind_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 582622, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "ind_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 586099, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + 
"unique_sentence2": 1996 + }, + "ind_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 534692, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ind_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 530311, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "ind_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 587477, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "ind_Latn-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 580392, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + "ind_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 556772, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "ind_Latn-fra_Latn": { + 
"num_samples": 1997, + "number_of_characters": 580080, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "ind_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 487268, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "ind_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 548547, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "ind_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 565348, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "ind_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 398598, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "ind_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 420377, + "unique_pairs": 
1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "ind_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 546369, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "ind_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 591124, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "ind_Latn-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 607280, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "ind_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 561096, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "ind_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 564155, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 
143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "ind_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 578887, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "ind_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 564838, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "ind_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 564187, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "ind_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 561865, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "ind_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 564827, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + 
"unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "ind_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 575172, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "ind_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 559481, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "ind_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 538799, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "ind_Latn-tah_Latn": { + "num_samples": 1997, + "number_of_characters": 596595, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "ind_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 596933, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + 
"average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "ind_Latn-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 600612, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "ind_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 552585, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "ind_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 558093, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "ind_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 378450, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "ind_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 544704, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 143.70155232849274, + "max_sentence1_length": 486, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 
494, + "unique_sentence2": 1996 + }, + "isl_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 530560, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "isl_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 514346, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "isl_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 557858, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "isl_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 509928, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "isl_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 520011, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "isl_Latn-ltz_Latn": { + 
"num_samples": 1997, + "number_of_characters": 542965, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "isl_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 554123, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "isl_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 510565, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "isl_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 513652, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "isl_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 514035, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 131.30095142714072, + "max_sentence1_length": 399, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "ita_Latn-cat_Latn": { + "num_samples": 1997, + "number_of_characters": 572177, + 
"unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 144.82573860791186, + "max_sentence1_length": 623, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 141.6925388082123, + "max_sentence2_length": 460, + "unique_sentence2": 1997 + }, + "ita_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 536937, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 144.82573860791186, + "max_sentence1_length": 623, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ita_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 582325, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 144.82573860791186, + "max_sentence1_length": 623, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "ita_Latn-glg_Latn": { + "num_samples": 1997, + "number_of_characters": 561203, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 144.82573860791186, + "max_sentence1_length": 623, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 136.19729594391586, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "ita_Latn-mlt_Latn": { + "num_samples": 1997, + "number_of_characters": 566692, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 144.82573860791186, + "max_sentence1_length": 623, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 138.94591887831749, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "ita_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 566432, + "unique_pairs": 1996, + "min_sentence1_length": 7, + 
"average_sentence1_length": 144.82573860791186, + "max_sentence1_length": 623, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "ita_Latn-ron_Latn": { + "num_samples": 1997, + "number_of_characters": 581702, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 144.82573860791186, + "max_sentence1_length": 623, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 146.4621932899349, + "max_sentence2_length": 518, + "unique_sentence2": 1997 + }, + "ita_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 577417, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 144.82573860791186, + "max_sentence1_length": 623, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "jpn_Jpan-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 342807, + "unique_pairs": 1995, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "jpn_Jpan-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 355428, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "jpn_Jpan-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 407276, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 
189, + "unique_sentence1": 1994, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "jpn_Jpan-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 410753, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "jpn_Jpan-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 359346, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "jpn_Jpan-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 354965, + "unique_pairs": 1995, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "jpn_Jpan-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 381426, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "jpn_Jpan-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 404734, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + 
"average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "jpn_Jpan-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 311922, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "jpn_Jpan-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 373201, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "jpn_Jpan-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 390002, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "jpn_Jpan-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 398598, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "jpn_Jpan-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 245031, + "unique_pairs": 1995, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, 
+ "unique_sentence2": 1995 + }, + "jpn_Jpan-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 371023, + "unique_pairs": 1995, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "jpn_Jpan-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 403541, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "jpn_Jpan-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 389492, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "jpn_Jpan-por_Latn": { + "num_samples": 1997, + "number_of_characters": 388841, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "jpn_Jpan-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 386519, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "jpn_Jpan-spa_Latn": { + "num_samples": 
1997, + "number_of_characters": 399826, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "jpn_Jpan-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 384135, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "jpn_Jpan-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 363453, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "jpn_Jpan-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 421587, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "jpn_Jpan-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 377239, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "jpn_Jpan-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 382747, + "unique_pairs": 1996, + 
"min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "jpn_Jpan-yue_Hant": { + "num_samples": 1997, + "number_of_characters": 190513, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 4, + "average_sentence2_length": 39.502754131196795, + "max_sentence2_length": 133, + "unique_sentence2": 1996 + }, + "jpn_Jpan-zho_Hans": { + "num_samples": 1997, + "number_of_characters": 196587, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 4, + "average_sentence2_length": 42.54431647471207, + "max_sentence2_length": 263, + "unique_sentence2": 1997 + }, + "jpn_Jpan-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 203104, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "jpn_Jpan-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 369358, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 55.89684526790185, + "max_sentence1_length": 189, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "kan_Knda-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 509338, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + 
"max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "kan_Knda-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 569384, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "kan_Knda-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 513256, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "kan_Knda-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 540739, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "kan_Knda-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 510467, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "kan_Knda-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 527111, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + 
"min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "kan_Knda-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 526423, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "kan_Knda-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 513759, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "kan_Knda-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 515958, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "kan_Knda-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 524277, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "kan_Knda-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 485863, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 
110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "kan_Knda-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 575497, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "kan_Knda-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 513063, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "kan_Knda-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 513534, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 132.96745117676514, + "max_sentence1_length": 449, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "kat_Geor-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 565719, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 133.49624436654983, + "max_sentence1_length": 503, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "kat_Geor-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 514312, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 133.49624436654983, + "max_sentence1_length": 503, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + 
"unique_sentence2": 1997 + }, + "kat_Geor-hye_Armn": { + "num_samples": 1997, + "number_of_characters": 531307, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 133.49624436654983, + "max_sentence1_length": 503, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 132.55633450175262, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "kat_Geor-sqi_Latn": { + "num_samples": 1997, + "number_of_characters": 550199, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 133.49624436654983, + "max_sentence1_length": 503, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 142.01652478718077, + "max_sentence2_length": 461, + "unique_sentence2": 1996 + }, + "kaz_Cyrl-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 529910, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, + "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "kaz_Cyrl-bak_Cyrl": { + "num_samples": 1997, + "number_of_characters": 506602, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "kaz_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 507996, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "kaz_Cyrl-kir_Cyrl": { + 
"num_samples": 1997, + "number_of_characters": 511140, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + "max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "kaz_Cyrl-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 506202, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 123.14772158237356, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "kaz_Cyrl-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 545550, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "kaz_Cyrl-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 525889, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "kaz_Cyrl-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 571298, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + "max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "kaz_Cyrl-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 553971, + 
"unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 130.33350025037555, + "max_sentence1_length": 473, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "khm_Khmr-bod_Tibt": { + "num_samples": 1997, + "number_of_characters": 589120, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 144.4621932899349, + "max_sentence1_length": 517, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 150.54031046569855, + "max_sentence2_length": 478, + "unique_sentence2": 1993 + }, + "khm_Khmr-dzo_Tibt": { + "num_samples": 1997, + "number_of_characters": 531712, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 144.4621932899349, + "max_sentence1_length": 517, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 121.79318978467701, + "max_sentence2_length": 411, + "unique_sentence2": 1992 + }, + "khm_Khmr-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 536211, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 144.4621932899349, + "max_sentence1_length": 517, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "khm_Khmr-lao_Laoo": { + "num_samples": 1997, + "number_of_characters": 555471, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 144.4621932899349, + "max_sentence1_length": 517, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 133.69053580370556, + "max_sentence2_length": 507, + "unique_sentence2": 1997 + }, + "khm_Khmr-mon_Mong": { + "num_samples": 1997, + "number_of_characters": 547539, + "unique_pairs": 1997, + "min_sentence1_length": 6, + 
"average_sentence1_length": 144.4621932899349, + "max_sentence1_length": 517, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 129.7185778668002, + "max_sentence2_length": 414, + "unique_sentence2": 1997 + }, + "khm_Khmr-mya_Mymr": { + "num_samples": 1997, + "number_of_characters": 600345, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 144.4621932899349, + "max_sentence1_length": 517, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 156.16124186279418, + "max_sentence2_length": 773, + "unique_sentence2": 1997 + }, + "khm_Khmr-tha_Thai": { + "num_samples": 1997, + "number_of_characters": 525959, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 144.4621932899349, + "max_sentence1_length": 517, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 118.91236855282925, + "max_sentence2_length": 439, + "unique_sentence2": 1996 + }, + "kin_Latn-bem_Latn": { + "num_samples": 1997, + "number_of_characters": 602279, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 152.12168252378567, + "max_sentence1_length": 541, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 149.47020530796195, + "max_sentence2_length": 465, + "unique_sentence2": 1997 + }, + "kin_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 551507, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 152.12168252378567, + "max_sentence1_length": 541, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "kin_Latn-ewe_Latn": { + "num_samples": 1997, + "number_of_characters": 542765, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 152.12168252378567, + 
"max_sentence1_length": 541, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 119.6685027541312, + "max_sentence2_length": 493, + "unique_sentence2": 1994 + }, + "kin_Latn-fuc_Latn": { + "num_samples": 1997, + "number_of_characters": 532267, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 152.12168252378567, + "max_sentence1_length": 541, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 114.4116174261392, + "max_sentence2_length": 376, + "unique_sentence2": 1996 + }, + "kin_Latn-nde_Latn": { + "num_samples": 1997, + "number_of_characters": 601526, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 152.12168252378567, + "max_sentence1_length": 541, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 149.09313970956435, + "max_sentence2_length": 590, + "unique_sentence2": 1997 + }, + "kin_Latn-nya_Latn": { + "num_samples": 1997, + "number_of_characters": 588069, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 152.12168252378567, + "max_sentence1_length": 541, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 142.35453179769655, + "max_sentence2_length": 464, + "unique_sentence2": 1993 + }, + "kin_Latn-sna_Latn": { + "num_samples": 1997, + "number_of_characters": 602117, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 152.12168252378567, + "max_sentence1_length": 541, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 149.38908362543816, + "max_sentence2_length": 511, + "unique_sentence2": 1995 + }, + "kin_Latn-ven_Latn": { + "num_samples": 1997, + "number_of_characters": 603543, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 152.12168252378567, + "max_sentence1_length": 541, + "unique_sentence1": 1996, + 
"min_sentence2_length": 10, + "average_sentence2_length": 150.10315473209815, + "max_sentence2_length": 535, + "unique_sentence2": 1993 + }, + "kir_Cyrl-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 520498, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, + "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "kir_Cyrl-bak_Cyrl": { + "num_samples": 1997, + "number_of_characters": 497190, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "kir_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 498584, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "kir_Cyrl-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 511140, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "kir_Cyrl-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 496790, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 
123.14772158237356, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "kir_Cyrl-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 536138, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "kir_Cyrl-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 516477, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "kir_Cyrl-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 561886, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + "max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "kir_Cyrl-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 544559, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.62043064596895, + "max_sentence1_length": 395, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "kmr_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 477127, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + 
"unique_sentence2": 1995 + }, + "kmr_Latn-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 498313, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "kmr_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 493666, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "kmr_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 489285, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "kmr_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 446242, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "kmr_Latn-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 459781, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "kmr_Latn-prs_Arab": { + 
"num_samples": 1997, + "number_of_characters": 488482, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "kmr_Latn-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 488579, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "kmr_Latn-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 460859, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "kmr_Latn-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 520093, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.15773660490736, + "max_sentence1_length": 420, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "kor_Hang-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 364586, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "kor_Hang-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 377207, + 
"unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "kor_Hang-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 429055, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "kor_Hang-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 432532, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "kor_Hang-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 381125, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "kor_Hang-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 376744, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "kor_Hang-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 403205, + "unique_pairs": 1996, + "min_sentence1_length": 5, + 
"average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "kor_Hang-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 426513, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "kor_Hang-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 333701, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "kor_Hang-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 394980, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "kor_Hang-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 411781, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "kor_Hang-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 420377, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, 
+ "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "kor_Hang-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 245031, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "kor_Hang-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 392802, + "unique_pairs": 1995, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "kor_Hang-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 425320, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "kor_Hang-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 411271, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "kor_Hang-por_Latn": { + "num_samples": 1997, + "number_of_characters": 410620, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + 
"average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "kor_Hang-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 408298, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "kor_Hang-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 421605, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "kor_Hang-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 405914, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "kor_Hang-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 385232, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "kor_Hang-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 443366, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 
581, + "unique_sentence2": 1997 + }, + "kor_Hang-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 399018, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "kor_Hang-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 404526, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "kor_Hang-yue_Hant": { + "num_samples": 1997, + "number_of_characters": 212292, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 4, + "average_sentence2_length": 39.502754131196795, + "max_sentence2_length": 133, + "unique_sentence2": 1996 + }, + "kor_Hang-zho_Hans": { + "num_samples": 1997, + "number_of_characters": 218366, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 4, + "average_sentence2_length": 42.54431647471207, + "max_sentence2_length": 263, + "unique_sentence2": 1997 + }, + "kor_Hang-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 224883, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "kor_Hang-zul_Latn": { + "num_samples": 
1997, + "number_of_characters": 391137, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 66.80270405608412, + "max_sentence1_length": 217, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "lao_Laoo-bod_Tibt": { + "num_samples": 1997, + "number_of_characters": 567609, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 133.69053580370556, + "max_sentence1_length": 507, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 150.54031046569855, + "max_sentence2_length": 478, + "unique_sentence2": 1993 + }, + "lao_Laoo-dzo_Tibt": { + "num_samples": 1997, + "number_of_characters": 510201, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 133.69053580370556, + "max_sentence1_length": 507, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 121.79318978467701, + "max_sentence2_length": 411, + "unique_sentence2": 1992 + }, + "lao_Laoo-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 514700, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 133.69053580370556, + "max_sentence1_length": 507, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "lao_Laoo-khm_Khmr": { + "num_samples": 1997, + "number_of_characters": 555471, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 133.69053580370556, + "max_sentence1_length": 507, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 144.4621932899349, + "max_sentence2_length": 517, + "unique_sentence2": 1996 + }, + "lao_Laoo-mon_Mong": { + "num_samples": 1997, + "number_of_characters": 526028, + "unique_pairs": 1997, + 
"min_sentence1_length": 7, + "average_sentence1_length": 133.69053580370556, + "max_sentence1_length": 507, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 129.7185778668002, + "max_sentence2_length": 414, + "unique_sentence2": 1997 + }, + "lao_Laoo-mya_Mymr": { + "num_samples": 1997, + "number_of_characters": 578834, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 133.69053580370556, + "max_sentence1_length": 507, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 156.16124186279418, + "max_sentence2_length": 773, + "unique_sentence2": 1997 + }, + "lao_Laoo-tha_Thai": { + "num_samples": 1997, + "number_of_characters": 504448, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 133.69053580370556, + "max_sentence1_length": 507, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 118.91236855282925, + "max_sentence2_length": 439, + "unique_sentence2": 1996 + }, + "lav_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 515908, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 134.29544316474713, + "max_sentence1_length": 503, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "lav_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 537988, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 134.29544316474713, + "max_sentence1_length": 503, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "lav_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 546564, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 
134.29544316474713, + "max_sentence1_length": 503, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "lav_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 527585, + "unique_pairs": 1995, + "min_sentence1_length": 7, + "average_sentence1_length": 134.29544316474713, + "max_sentence1_length": 503, + "unique_sentence1": 1994, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "lit_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 490578, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "lit_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 503199, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "lit_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 555047, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "lit_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 558524, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + 
"min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "lit_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 507117, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "lit_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 502736, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "lit_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 529197, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "lit_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 552505, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "lit_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 459693, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + 
"max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "lit_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 520972, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "lit_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 537773, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "lit_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 546369, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "lit_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 371023, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "lit_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 392802, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "lit_Latn-lav_Latn": 
{ + "num_samples": 1997, + "number_of_characters": 527585, + "unique_pairs": 1995, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 134.29544316474713, + "max_sentence2_length": 503, + "unique_sentence2": 1994 + }, + "lit_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 551312, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "lit_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 537263, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "lit_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 536612, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "lit_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 534290, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "lit_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 547597, + "unique_pairs": 
1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "lit_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 531906, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "lit_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 511224, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "lit_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 569358, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "lit_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 525010, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "lit_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 530518, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 
129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "lit_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 350875, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "lit_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 517129, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 129.893340010015, + "max_sentence1_length": 446, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "ltz_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 549109, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "ltz_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 532895, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "ltz_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 576407, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, 
+ "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "ltz_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 528477, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ltz_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 538560, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "ltz_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 542965, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "ltz_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 572672, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "ltz_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 529114, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 
124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "ltz_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 532201, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "ltz_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 532584, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 140.58938407611416, + "max_sentence1_length": 543, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "mal_Mlym-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 551872, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "mal_Mlym-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 604657, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "mal_Mlym-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 597572, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + 
"unique_sentence2": 1997 + }, + "mal_Mlym-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 591124, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "mal_Mlym-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 624460, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "mal_Mlym-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 578276, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "mal_Mlym-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 581335, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "mal_Mlym-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 582007, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "mal_Mlym-tah_Latn": { + 
"num_samples": 1997, + "number_of_characters": 613775, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "mal_Mlym-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 617792, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 152.30445668502753, + "max_sentence1_length": 540, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "mar_Deva-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 504689, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "mar_Deva-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 564735, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "mar_Deva-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 508607, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "mar_Deva-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 536090, + 
"unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "mar_Deva-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 505818, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "mar_Deva-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 522462, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "mar_Deva-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 526423, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "mar_Deva-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 509110, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "mar_Deva-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 511309, + "unique_pairs": 1996, + "min_sentence1_length": 7, + 
"average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "mar_Deva-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 519628, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "mar_Deva-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 481214, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "mar_Deva-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 570848, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "mar_Deva-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 508414, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + "max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "mar_Deva-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 508885, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 130.63945918878318, + 
"max_sentence1_length": 443, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "mey_Arab-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 445016, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "mey_Arab-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 466202, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "mey_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 461555, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "mey_Arab-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 457174, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "mey_Arab-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 414131, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + 
"min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "mey_Arab-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 459781, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "mey_Arab-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 456371, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "mey_Arab-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 456468, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "mey_Arab-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 428748, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "mey_Arab-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 487982, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 107.07811717576365, + "max_sentence1_length": 392, + "unique_sentence1": 1993, + "min_sentence2_length": 3, + "average_sentence2_length": 
137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "mkd_Cyrl-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 523981, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "mkd_Cyrl-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 522801, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "mkd_Cyrl-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 537780, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "mkd_Cyrl-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 509209, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "mkd_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 515611, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 
1997 + }, + "mkd_Cyrl-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 523816, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "mkd_Cyrl-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 545757, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "mkd_Cyrl-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 542784, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "mkd_Cyrl-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 520860, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "mkd_Cyrl-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 520787, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "mkd_Cyrl-srp_Cyrl": { + "num_samples": 1997, + 
"number_of_characters": 520194, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "mkd_Cyrl-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 524032, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "mkd_Cyrl-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 530674, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.1467200801202, + "max_sentence1_length": 451, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "mlg_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 568028, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "mlg_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 620813, + "unique_pairs": 1995, + "min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "mlg_Latn-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 613728, + "unique_pairs": 1997, + 
"min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + "mlg_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 607280, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "mlg_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 624460, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "mlg_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 594432, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "mlg_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 597491, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "mlg_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 598163, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 
160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "mlg_Latn-tah_Latn": { + "num_samples": 1997, + "number_of_characters": 629931, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "mlg_Latn-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 633948, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 160.39459188783175, + "max_sentence1_length": 559, + "unique_sentence1": 1994, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "mlt_Latn-cat_Latn": { + "num_samples": 1997, + "number_of_characters": 560435, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 138.94591887831749, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 141.6925388082123, + "max_sentence2_length": 460, + "unique_sentence2": 1997 + }, + "mlt_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 525195, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 138.94591887831749, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "mlt_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 570583, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 138.94591887831749, + "max_sentence1_length": 582, + 
"unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "mlt_Latn-glg_Latn": { + "num_samples": 1997, + "number_of_characters": 549461, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 138.94591887831749, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 136.19729594391586, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "mlt_Latn-ita_Latn": { + "num_samples": 1997, + "number_of_characters": 566692, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 138.94591887831749, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 144.82573860791186, + "max_sentence2_length": 623, + "unique_sentence2": 1996 + }, + "mlt_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 554690, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 138.94591887831749, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "mlt_Latn-ron_Latn": { + "num_samples": 1997, + "number_of_characters": 569960, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 138.94591887831749, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 146.4621932899349, + "max_sentence2_length": 518, + "unique_sentence2": 1997 + }, + "mlt_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 565675, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 138.94591887831749, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + 
"average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "mon_Mong-bod_Tibt": { + "num_samples": 1997, + "number_of_characters": 559677, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 129.7185778668002, + "max_sentence1_length": 414, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 150.54031046569855, + "max_sentence2_length": 478, + "unique_sentence2": 1993 + }, + "mon_Mong-dzo_Tibt": { + "num_samples": 1997, + "number_of_characters": 502269, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 129.7185778668002, + "max_sentence1_length": 414, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 121.79318978467701, + "max_sentence2_length": 411, + "unique_sentence2": 1992 + }, + "mon_Mong-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 506768, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 129.7185778668002, + "max_sentence1_length": 414, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "mon_Mong-khm_Khmr": { + "num_samples": 1997, + "number_of_characters": 547539, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 129.7185778668002, + "max_sentence1_length": 414, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 144.4621932899349, + "max_sentence2_length": 517, + "unique_sentence2": 1996 + }, + "mon_Mong-lao_Laoo": { + "num_samples": 1997, + "number_of_characters": 526028, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 129.7185778668002, + "max_sentence1_length": 414, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 133.69053580370556, + 
"max_sentence2_length": 507, + "unique_sentence2": 1997 + }, + "mon_Mong-mya_Mymr": { + "num_samples": 1997, + "number_of_characters": 570902, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 129.7185778668002, + "max_sentence1_length": 414, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 156.16124186279418, + "max_sentence2_length": 773, + "unique_sentence2": 1997 + }, + "mon_Mong-tha_Thai": { + "num_samples": 1997, + "number_of_characters": 496516, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 129.7185778668002, + "max_sentence1_length": 414, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 118.91236855282925, + "max_sentence2_length": 439, + "unique_sentence2": 1996 + }, + "mri_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 521844, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "mri_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 574629, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "mri_Latn-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 567544, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + 
"mri_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 561096, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "mri_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 578276, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "mri_Latn-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 594432, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "mri_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 551307, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "mri_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 551979, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "mri_Latn-tah_Latn": { + "num_samples": 1997, + 
"number_of_characters": 583747, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "mri_Latn-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 587764, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 137.26790185277918, + "max_sentence1_length": 443, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "msa_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 524903, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "msa_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 577688, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "msa_Latn-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 570603, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + "msa_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 564155, + "unique_pairs": 1997, + 
"min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "msa_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 581335, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "msa_Latn-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 597491, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "msa_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 551307, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "msa_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 555038, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "msa_Latn-tah_Latn": { + "num_samples": 1997, + "number_of_characters": 586806, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 
138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "msa_Latn-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 590823, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 138.79969954932398, + "max_sentence1_length": 463, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "mya_Mymr-bod_Tibt": { + "num_samples": 1997, + "number_of_characters": 612483, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 156.16124186279418, + "max_sentence1_length": 773, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 150.54031046569855, + "max_sentence2_length": 478, + "unique_sentence2": 1993 + }, + "mya_Mymr-dzo_Tibt": { + "num_samples": 1997, + "number_of_characters": 555075, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 156.16124186279418, + "max_sentence1_length": 773, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 121.79318978467701, + "max_sentence2_length": 411, + "unique_sentence2": 1992 + }, + "mya_Mymr-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 559574, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 156.16124186279418, + "max_sentence1_length": 773, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "mya_Mymr-khm_Khmr": { + "num_samples": 1997, + "number_of_characters": 600345, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 156.16124186279418, + "max_sentence1_length": 773, + 
"unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 144.4621932899349, + "max_sentence2_length": 517, + "unique_sentence2": 1996 + }, + "mya_Mymr-lao_Laoo": { + "num_samples": 1997, + "number_of_characters": 578834, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 156.16124186279418, + "max_sentence1_length": 773, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 133.69053580370556, + "max_sentence2_length": 507, + "unique_sentence2": 1997 + }, + "mya_Mymr-mon_Mong": { + "num_samples": 1997, + "number_of_characters": 570902, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 156.16124186279418, + "max_sentence1_length": 773, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 129.7185778668002, + "max_sentence2_length": 414, + "unique_sentence2": 1997 + }, + "mya_Mymr-tha_Thai": { + "num_samples": 1997, + "number_of_characters": 549322, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 156.16124186279418, + "max_sentence1_length": 773, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 118.91236855282925, + "max_sentence2_length": 439, + "unique_sentence2": 1996 + }, + "nde_Latn-bem_Latn": { + "num_samples": 1997, + "number_of_characters": 596231, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.09313970956435, + "max_sentence1_length": 590, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 149.47020530796195, + "max_sentence2_length": 465, + "unique_sentence2": 1997 + }, + "nde_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 545459, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.09313970956435, + "max_sentence1_length": 590, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + 
"average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "nde_Latn-ewe_Latn": { + "num_samples": 1997, + "number_of_characters": 536717, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.09313970956435, + "max_sentence1_length": 590, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 119.6685027541312, + "max_sentence2_length": 493, + "unique_sentence2": 1994 + }, + "nde_Latn-fuc_Latn": { + "num_samples": 1997, + "number_of_characters": 526219, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.09313970956435, + "max_sentence1_length": 590, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 114.4116174261392, + "max_sentence2_length": 376, + "unique_sentence2": 1996 + }, + "nde_Latn-kin_Latn": { + "num_samples": 1997, + "number_of_characters": 601526, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.09313970956435, + "max_sentence1_length": 590, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 152.12168252378567, + "max_sentence2_length": 541, + "unique_sentence2": 1996 + }, + "nde_Latn-nya_Latn": { + "num_samples": 1997, + "number_of_characters": 582021, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.09313970956435, + "max_sentence1_length": 590, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 142.35453179769655, + "max_sentence2_length": 464, + "unique_sentence2": 1993 + }, + "nde_Latn-sna_Latn": { + "num_samples": 1997, + "number_of_characters": 596069, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.09313970956435, + "max_sentence1_length": 590, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 149.38908362543816, + 
"max_sentence2_length": 511, + "unique_sentence2": 1995 + }, + "nde_Latn-ven_Latn": { + "num_samples": 1997, + "number_of_characters": 597495, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.09313970956435, + "max_sentence1_length": 590, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.10315473209815, + "max_sentence2_length": 535, + "unique_sentence2": 1993 + }, + "nep_Deva-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 492025, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "nep_Deva-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 552071, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "nep_Deva-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 495943, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "nep_Deva-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 523426, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + 
"nep_Deva-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 493154, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "nep_Deva-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 509798, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "nep_Deva-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 513759, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "nep_Deva-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 509110, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "nep_Deva-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 498645, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "nep_Deva-sin_Sinh": { + "num_samples": 1997, + 
"number_of_characters": 506964, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "nep_Deva-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 468550, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "nep_Deva-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 558184, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "nep_Deva-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 495750, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "nep_Deva-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 496221, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 124.29794692038057, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "nld_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 560267, + "unique_pairs": 1996, + 
"min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "nld_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 523096, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "nld_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 535717, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "nld_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 544053, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "nld_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 587565, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "nld_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 591042, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 
146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "nld_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 539635, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "nld_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 549718, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "nld_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 535254, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "nld_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 561715, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "nld_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 585023, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 
1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "nld_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 492211, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "nld_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 553490, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "nld_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 570291, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "nld_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 578887, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "nld_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 554123, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 
131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "nld_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 403541, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "nld_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 425320, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "nld_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 551312, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "nld_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 572672, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "nld_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 540272, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 
+ }, + "nld_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 543359, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "nld_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 569781, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "nld_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 569130, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "nld_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 566808, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "nld_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 580115, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "nld_Latn-swa_Latn": { + "num_samples": 1997, + 
"number_of_characters": 564424, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "nld_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 543742, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "nld_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 601876, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "nld_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 557528, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "nld_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 563036, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "nld_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 383393, + "unique_pairs": 1996, + 
"min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "nld_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 549647, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 146.1767651477216, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "nno_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 516709, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "nno_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 500495, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "nno_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 544007, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "nno_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 496077, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 
124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "nno_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 506160, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "nno_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 510565, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "nno_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 529114, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "nno_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 540272, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "nno_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 499801, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + 
"unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "nno_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 500184, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.36504757135704, + "max_sentence1_length": 417, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "nob_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 519796, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "nob_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 503582, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "nob_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 547094, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "nob_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 499164, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + 
"average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "nob_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 509247, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "nob_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 513652, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "nob_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 532201, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "nob_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 543359, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "nob_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 499801, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + 
"max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "nob_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 503271, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 125.91086629944917, + "max_sentence1_length": 482, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "nso_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 459006, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "nso_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 539219, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "nso_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 561465, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "nso_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 537600, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + 
"nso_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 528930, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "nso_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 582791, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "nso_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 579641, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "nso_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 564008, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "nso_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 456737, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "nso_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 
625782, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "nso_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 531302, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "nso_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 559589, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "nso_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 607587, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "nso_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 549231, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 145.96845267901853, + "max_sentence1_length": 487, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "nya_Latn-bem_Latn": { + "num_samples": 1997, + "number_of_characters": 582774, + "unique_pairs": 1997, + "min_sentence1_length": 10, + 
"average_sentence1_length": 142.35453179769655, + "max_sentence1_length": 464, + "unique_sentence1": 1993, + "min_sentence2_length": 8, + "average_sentence2_length": 149.47020530796195, + "max_sentence2_length": 465, + "unique_sentence2": 1997 + }, + "nya_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 532002, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 142.35453179769655, + "max_sentence1_length": 464, + "unique_sentence1": 1993, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "nya_Latn-ewe_Latn": { + "num_samples": 1997, + "number_of_characters": 523260, + "unique_pairs": 1995, + "min_sentence1_length": 10, + "average_sentence1_length": 142.35453179769655, + "max_sentence1_length": 464, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + "average_sentence2_length": 119.6685027541312, + "max_sentence2_length": 493, + "unique_sentence2": 1994 + }, + "nya_Latn-fuc_Latn": { + "num_samples": 1997, + "number_of_characters": 512762, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 142.35453179769655, + "max_sentence1_length": 464, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + "average_sentence2_length": 114.4116174261392, + "max_sentence2_length": 376, + "unique_sentence2": 1996 + }, + "nya_Latn-kin_Latn": { + "num_samples": 1997, + "number_of_characters": 588069, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 142.35453179769655, + "max_sentence1_length": 464, + "unique_sentence1": 1993, + "min_sentence2_length": 10, + "average_sentence2_length": 152.12168252378567, + "max_sentence2_length": 541, + "unique_sentence2": 1996 + }, + "nya_Latn-nde_Latn": { + "num_samples": 1997, + "number_of_characters": 582021, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 142.35453179769655, + 
"max_sentence1_length": 464, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + "average_sentence2_length": 149.09313970956435, + "max_sentence2_length": 590, + "unique_sentence2": 1997 + }, + "nya_Latn-sna_Latn": { + "num_samples": 1997, + "number_of_characters": 582612, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 142.35453179769655, + "max_sentence1_length": 464, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + "average_sentence2_length": 149.38908362543816, + "max_sentence2_length": 511, + "unique_sentence2": 1995 + }, + "nya_Latn-ven_Latn": { + "num_samples": 1997, + "number_of_characters": 584038, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 142.35453179769655, + "max_sentence1_length": 464, + "unique_sentence1": 1993, + "min_sentence2_length": 10, + "average_sentence2_length": 150.10315473209815, + "max_sentence2_length": 535, + "unique_sentence2": 1993 + }, + "orm_Ethi-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 404938, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "orm_Ethi-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 485151, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "orm_Ethi-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 507397, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + 
"min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "orm_Ethi-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 483532, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "orm_Ethi-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 528930, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "orm_Ethi-som_Latn": { + "num_samples": 1997, + "number_of_characters": 528723, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "orm_Ethi-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 525573, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "orm_Ethi-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 509940, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 10, + "average_sentence2_length": 
136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "orm_Ethi-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 402669, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "orm_Ethi-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 571714, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "orm_Ethi-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 477234, + "unique_pairs": 1992, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "orm_Ethi-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 505521, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "orm_Ethi-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 553519, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 
1996 + }, + "orm_Ethi-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 495163, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 118.89384076114172, + "max_sentence1_length": 466, + "unique_sentence1": 1984, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "pan_Guru-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 494224, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "pan_Guru-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 554270, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "pan_Guru-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 498142, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "pan_Guru-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 525625, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "pan_Guru-guj_Gujr": { + "num_samples": 1997, + 
"number_of_characters": 495353, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "pan_Guru-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 511997, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "pan_Guru-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 515958, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "pan_Guru-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 511309, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "pan_Guru-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 498645, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "pan_Guru-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 509163, + "unique_pairs": 1996, + 
"min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "pan_Guru-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 470749, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "pan_Guru-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 560383, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "pan_Guru-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 497949, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "pan_Guru-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 498420, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 125.39909864797195, + "max_sentence1_length": 383, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "pol_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 509047, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 
139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "pol_Latn-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 533956, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "pol_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 521668, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "pol_Latn-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 532776, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "pol_Latn-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 547755, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "pol_Latn-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 519184, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + 
"unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "pol_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 573516, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "pol_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 576993, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "pol_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 525586, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "pol_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 521205, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "pol_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 547666, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + 
"average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "pol_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 570974, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "pol_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 478162, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "pol_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 539441, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "pol_Latn-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 533791, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "pol_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 556242, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 
508, + "unique_sentence2": 1997 + }, + "pol_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 564838, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "pol_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 389492, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "pol_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 411271, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "pol_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 537263, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "pol_Latn-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 545757, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "pol_Latn-nld_Latn": { + 
"num_samples": 1997, + "number_of_characters": 569781, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "pol_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 555081, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "pol_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 552759, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "pol_Latn-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 530835, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "pol_Latn-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 530762, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "pol_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 566066, + 
"unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "pol_Latn-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 530169, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "pol_Latn-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 534007, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "pol_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 550375, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "pol_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 529693, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "pol_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 587827, + "unique_pairs": 1997, + "min_sentence1_length": 9, + 
"average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "pol_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 543479, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "pol_Latn-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 540649, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "pol_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 548987, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "pol_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 369344, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + "max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "pol_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 535598, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 139.14171256885328, + 
"max_sentence1_length": 468, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "por_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 508396, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "por_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 521017, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "por_Latn-cat_Latn": { + "num_samples": 1997, + "number_of_characters": 560175, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 141.6925388082123, + "max_sentence2_length": 460, + "unique_sentence2": 1997 + }, + "por_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 572865, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "por_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 576342, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + 
"min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "por_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 524935, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "por_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 520554, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "por_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 547015, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "por_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 570323, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "por_Latn-glg_Latn": { + "num_samples": 1997, + "number_of_characters": 549201, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 
136.19729594391586, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "por_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 477511, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "por_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 538790, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "por_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 555591, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "por_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 564187, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "por_Latn-ita_Latn": { + "num_samples": 1997, + "number_of_characters": 566432, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 144.82573860791186, + "max_sentence2_length": 623, + 
"unique_sentence2": 1996 + }, + "por_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 388841, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "por_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 410620, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "por_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 536612, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "por_Latn-mlt_Latn": { + "num_samples": 1997, + "number_of_characters": 554690, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 138.94591887831749, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "por_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 569130, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "por_Latn-pol_Latn": { + "num_samples": 
1997, + "number_of_characters": 555081, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "por_Latn-ron_Latn": { + "num_samples": 1997, + "number_of_characters": 569700, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 146.4621932899349, + "max_sentence2_length": 518, + "unique_sentence2": 1997 + }, + "por_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 552108, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "por_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 565415, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "por_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 549724, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "por_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 529042, + "unique_pairs": 1996, + 
"min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "por_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 587176, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "por_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 542828, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "por_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 548336, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "por_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 368693, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "por_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 534947, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 
138.81572358537807, + "max_sentence1_length": 497, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "prs_Arab-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 473717, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "prs_Arab-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 494903, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "prs_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 490256, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "prs_Arab-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 485875, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "prs_Arab-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 442832, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + 
"unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "prs_Arab-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 488482, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "prs_Arab-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 456371, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "prs_Arab-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 485169, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "prs_Arab-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 457449, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "prs_Arab-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 516683, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.45017526289435, + "max_sentence1_length": 365, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + 
"average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "pus_Arab-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 473814, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "pus_Arab-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 495000, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "pus_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 490353, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "pus_Arab-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 485972, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "pus_Arab-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 442929, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + 
"max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "pus_Arab-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 488579, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "pus_Arab-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 456468, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "pus_Arab-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 485169, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "pus_Arab-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 457546, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "pus_Arab-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 516780, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 121.49874812218327, + "max_sentence1_length": 366, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + 
"ron_Latn-cat_Latn": { + "num_samples": 1997, + "number_of_characters": 575445, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 146.4621932899349, + "max_sentence1_length": 518, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 141.6925388082123, + "max_sentence2_length": 460, + "unique_sentence2": 1997 + }, + "ron_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 540205, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 146.4621932899349, + "max_sentence1_length": 518, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ron_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 585593, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 146.4621932899349, + "max_sentence1_length": 518, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "ron_Latn-glg_Latn": { + "num_samples": 1997, + "number_of_characters": 564471, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 146.4621932899349, + "max_sentence1_length": 518, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 136.19729594391586, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "ron_Latn-ita_Latn": { + "num_samples": 1997, + "number_of_characters": 581702, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 146.4621932899349, + "max_sentence1_length": 518, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 144.82573860791186, + "max_sentence2_length": 623, + "unique_sentence2": 1996 + }, + "ron_Latn-mlt_Latn": { + "num_samples": 1997, + "number_of_characters": 
569960, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 146.4621932899349, + "max_sentence1_length": 518, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 138.94591887831749, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "ron_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 569700, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 146.4621932899349, + "max_sentence1_length": 518, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "ron_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 580685, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 146.4621932899349, + "max_sentence1_length": 518, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "rus_Cyrl-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 506074, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "rus_Cyrl-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 530983, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "rus_Cyrl-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 518695, + "unique_pairs": 1997, + "min_sentence1_length": 7, + 
"average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "rus_Cyrl-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 529803, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "rus_Cyrl-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 544782, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "rus_Cyrl-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 516211, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "rus_Cyrl-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 570543, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "rus_Cyrl-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 574020, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 
419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "rus_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 522613, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "rus_Cyrl-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 518232, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "rus_Cyrl-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 544693, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "rus_Cyrl-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 568001, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "rus_Cyrl-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 475189, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + 
"average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "rus_Cyrl-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 536468, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "rus_Cyrl-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 530818, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "rus_Cyrl-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 553269, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "rus_Cyrl-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 561865, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "rus_Cyrl-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 386519, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, 
+ "unique_sentence2": 1994 + }, + "rus_Cyrl-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 408298, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "rus_Cyrl-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 534290, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "rus_Cyrl-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 542784, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "rus_Cyrl-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 566808, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "rus_Cyrl-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 552759, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "rus_Cyrl-por_Latn": { + "num_samples": 1997, 
+ "number_of_characters": 552108, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "rus_Cyrl-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 527862, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "rus_Cyrl-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 527789, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "rus_Cyrl-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 563093, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "rus_Cyrl-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 527196, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "rus_Cyrl-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 531034, + "unique_pairs": 1996, + 
"min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "rus_Cyrl-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 547402, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "rus_Cyrl-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 526720, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "rus_Cyrl-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 584854, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "rus_Cyrl-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 540506, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "rus_Cyrl-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 537676, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 
137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "rus_Cyrl-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 546014, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "rus_Cyrl-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 366371, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "rus_Cyrl-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 532625, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 137.6529794692038, + "max_sentence1_length": 419, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "shi_Arab-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 446094, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "shi_Arab-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 467280, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, 
+ "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "shi_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 462633, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "shi_Arab-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 458252, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "shi_Arab-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 415209, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "shi_Arab-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 460859, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "shi_Arab-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 428748, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 
107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "shi_Arab-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 457449, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "shi_Arab-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 457546, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "shi_Arab-tgk_Cyrl": { + "num_samples": 1997, + "number_of_characters": 489060, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 107.6179268903355, + "max_sentence1_length": 378, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 137.27941912869304, + "max_sentence2_length": 451, + "unique_sentence2": 1995 + }, + "sin_Sinh-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 502543, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "sin_Sinh-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 562589, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 
1996 + }, + "sin_Sinh-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 506461, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "sin_Sinh-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 533944, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "sin_Sinh-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 503672, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "sin_Sinh-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 520316, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "sin_Sinh-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 524277, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "sin_Sinh-mar_Deva": { + "num_samples": 1997, + 
"number_of_characters": 519628, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "sin_Sinh-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 506964, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "sin_Sinh-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 509163, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "sin_Sinh-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 479068, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "sin_Sinh-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 568702, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "sin_Sinh-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 506268, + "unique_pairs": 1996, + 
"min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "sin_Sinh-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 506739, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 129.56484727090637, + "max_sentence1_length": 441, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "slk_Latn-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 509059, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "slk_Latn-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 507879, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "slk_Latn-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 522858, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "slk_Latn-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 494287, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 
126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "slk_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 500689, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "slk_Latn-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 508894, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "slk_Latn-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 520860, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "slk_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 530835, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "slk_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 527862, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + 
"unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "slk_Latn-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 505865, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "slk_Latn-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 505272, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "slk_Latn-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 509110, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "slk_Latn-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 515752, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 126.67451176765148, + "max_sentence1_length": 403, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "slv_Latn-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 508986, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + 
"average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "slv_Latn-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 507806, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "slv_Latn-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 522785, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "slv_Latn-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 494214, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "slv_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 500616, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "slv_Latn-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 508821, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + 
"max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "slv_Latn-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 520787, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "slv_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 530762, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "slv_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 527789, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "slv_Latn-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 505865, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "slv_Latn-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 505199, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + 
"slv_Latn-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 509037, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "slv_Latn-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 515679, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.63795693540311, + "max_sentence1_length": 463, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "smo_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 525575, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "smo_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 578360, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "smo_Latn-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 571275, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + "smo_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 
564827, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "smo_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 582007, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "smo_Latn-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 598163, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "smo_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 551979, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "smo_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 555038, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "smo_Latn-tah_Latn": { + "num_samples": 1997, + "number_of_characters": 587478, + "unique_pairs": 1997, + "min_sentence1_length": 8, + 
"average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "smo_Latn-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 591495, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 139.1362043064597, + "max_sentence1_length": 431, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "sna_Latn-bem_Latn": { + "num_samples": 1997, + "number_of_characters": 596822, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.38908362543816, + "max_sentence1_length": 511, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 149.47020530796195, + "max_sentence2_length": 465, + "unique_sentence2": 1997 + }, + "sna_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 546050, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.38908362543816, + "max_sentence1_length": 511, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "sna_Latn-ewe_Latn": { + "num_samples": 1997, + "number_of_characters": 537308, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 149.38908362543816, + "max_sentence1_length": 511, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 119.6685027541312, + "max_sentence2_length": 493, + "unique_sentence2": 1994 + }, + "sna_Latn-fuc_Latn": { + "num_samples": 1997, + "number_of_characters": 526810, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.38908362543816, + 
"max_sentence1_length": 511, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 114.4116174261392, + "max_sentence2_length": 376, + "unique_sentence2": 1996 + }, + "sna_Latn-kin_Latn": { + "num_samples": 1997, + "number_of_characters": 602117, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.38908362543816, + "max_sentence1_length": 511, + "unique_sentence1": 1995, + "min_sentence2_length": 10, + "average_sentence2_length": 152.12168252378567, + "max_sentence2_length": 541, + "unique_sentence2": 1996 + }, + "sna_Latn-nde_Latn": { + "num_samples": 1997, + "number_of_characters": 596069, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 149.38908362543816, + "max_sentence1_length": 511, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 149.09313970956435, + "max_sentence2_length": 590, + "unique_sentence2": 1997 + }, + "sna_Latn-nya_Latn": { + "num_samples": 1997, + "number_of_characters": 582612, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 149.38908362543816, + "max_sentence1_length": 511, + "unique_sentence1": 1995, + "min_sentence2_length": 10, + "average_sentence2_length": 142.35453179769655, + "max_sentence2_length": 464, + "unique_sentence2": 1993 + }, + "sna_Latn-ven_Latn": { + "num_samples": 1997, + "number_of_characters": 598086, + "unique_pairs": 1995, + "min_sentence1_length": 6, + "average_sentence1_length": 149.38908362543816, + "max_sentence1_length": 511, + "unique_sentence1": 1995, + "min_sentence2_length": 10, + "average_sentence2_length": 150.10315473209815, + "max_sentence2_length": 535, + "unique_sentence2": 1993 + }, + "snd_Arab-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 464129, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + 
"min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "snd_Arab-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 524175, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "snd_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 468047, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "snd_Arab-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 495530, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "snd_Arab-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 465258, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "snd_Arab-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 481902, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 
130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "snd_Arab-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 485863, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "snd_Arab-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 481214, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "snd_Arab-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 468550, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "snd_Arab-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 470749, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "snd_Arab-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 479068, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + 
"unique_sentence2": 1996 + }, + "snd_Arab-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 530288, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "snd_Arab-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 467854, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "snd_Arab-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 468325, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 110.32899349023535, + "max_sentence1_length": 335, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "som_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 458799, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "som_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 539012, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "som_Latn-hau_Latn": { + "num_samples": 
1997, + "number_of_characters": 561258, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "som_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 537393, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "som_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 582791, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "som_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 528723, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "som_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 579434, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "som_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 563801, + "unique_pairs": 1997, + 
"min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "som_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 456530, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "som_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 625575, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "som_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 531095, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "som_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 559382, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "som_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 607380, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + 
"max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "som_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 549024, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 145.8647971957937, + "max_sentence1_length": 455, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "spa_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 519381, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "spa_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 532002, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "spa_Latn-cat_Latn": { + "num_samples": 1997, + "number_of_characters": 571160, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 141.6925388082123, + "max_sentence2_length": 460, + "unique_sentence2": 1997 + }, + "spa_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 583850, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + 
"min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "spa_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 587327, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "spa_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 535920, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "spa_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 531539, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "spa_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 558000, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "spa_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 581308, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 
146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "spa_Latn-glg_Latn": { + "num_samples": 1997, + "number_of_characters": 560186, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 136.19729594391586, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "spa_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 488496, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "spa_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 549775, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "spa_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 566576, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "spa_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 575172, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 
+ }, + "spa_Latn-ita_Latn": { + "num_samples": 1997, + "number_of_characters": 577417, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 144.82573860791186, + "max_sentence2_length": 623, + "unique_sentence2": 1996 + }, + "spa_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 399826, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "spa_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 421605, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "spa_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 547597, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "spa_Latn-mlt_Latn": { + "num_samples": 1997, + "number_of_characters": 565675, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 138.94591887831749, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "spa_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 
580115, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "spa_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 566066, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "spa_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 565415, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "spa_Latn-ron_Latn": { + "num_samples": 1997, + "number_of_characters": 580685, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 146.4621932899349, + "max_sentence2_length": 518, + "unique_sentence2": 1997 + }, + "spa_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 563093, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "spa_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 560709, + "unique_pairs": 1997, + "min_sentence1_length": 1, + 
"average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "spa_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 540027, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "spa_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 598161, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "spa_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 553813, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "spa_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 559321, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "spa_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 379678, + "unique_pairs": 1996, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 
504, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "spa_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 545932, + "unique_pairs": 1997, + "min_sentence1_length": 1, + "average_sentence1_length": 144.3164747120681, + "max_sentence1_length": 504, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "sqi_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 582734, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 142.01652478718077, + "max_sentence1_length": 461, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "sqi_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 531327, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 142.01652478718077, + "max_sentence1_length": 461, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "sqi_Latn-hye_Armn": { + "num_samples": 1997, + "number_of_characters": 548322, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 142.01652478718077, + "max_sentence1_length": 461, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 132.55633450175262, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "sqi_Latn-kat_Geor": { + "num_samples": 1997, + "number_of_characters": 550199, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 142.01652478718077, + "max_sentence1_length": 461, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + 
"average_sentence2_length": 133.49624436654983, + "max_sentence2_length": 503, + "unique_sentence2": 1995 + }, + "srp_Cyrl-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 508393, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "srp_Cyrl-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 507213, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "srp_Cyrl-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 522192, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "srp_Cyrl-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 493621, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "srp_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 500023, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + 
"max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "srp_Cyrl-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 508228, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "srp_Cyrl-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 520194, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "srp_Cyrl-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 530169, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "srp_Cyrl-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 527196, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "srp_Cyrl-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 505272, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + 
"srp_Cyrl-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 505199, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "srp_Cyrl-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 508444, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "srp_Cyrl-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 515086, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 126.34101151727592, + "max_sentence1_length": 439, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "srp_Latn-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 512231, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "srp_Latn-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 511051, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "srp_Latn-bul_Cyrl": { + "num_samples": 1997, + 
"number_of_characters": 526030, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "srp_Latn-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 497459, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "srp_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 503861, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "srp_Latn-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 512066, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "srp_Latn-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 524032, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "srp_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 534007, + "unique_pairs": 1996, + 
"min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "srp_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 531034, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "srp_Latn-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 509110, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "srp_Latn-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 509037, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "srp_Latn-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 508444, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "srp_Latn-ukr_Cyrl": { + "num_samples": 1997, + "number_of_characters": 518924, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 
128.26289434151226, + "max_sentence1_length": 452, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 131.58888332498748, + "max_sentence2_length": 440, + "unique_sentence2": 1996 + }, + "ssw_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 455649, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "ssw_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 535862, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ssw_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 558108, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "ssw_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 534243, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "ssw_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 579641, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 
1996, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "ssw_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 525573, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "ssw_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 579434, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "ssw_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 560651, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "ssw_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 453380, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "ssw_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 622425, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 
167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "ssw_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 527945, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "ssw_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 556232, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "ssw_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 604230, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "ssw_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 545874, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 144.28743114672008, + "max_sentence1_length": 510, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "swa_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 440016, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 
1994 + }, + "swa_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 503690, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "swa_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 516311, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "swa_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 568159, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "swa_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 571636, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "swa_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 520229, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "swa_Latn-fas_Arab": { + "num_samples": 1997, + 
"number_of_characters": 515848, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "swa_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 542309, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "swa_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 565617, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "swa_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 542475, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "swa_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 472805, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "swa_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 534084, + "unique_pairs": 1997, + 
"min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "swa_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 550885, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "swa_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 518610, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "swa_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 559481, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "swa_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 384135, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "swa_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 405914, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 
136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "swa_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 531906, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "swa_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 564424, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "swa_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 564008, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "swa_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 509940, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "swa_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 550375, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + 
"unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "swa_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 549724, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "swa_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 547402, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "swa_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 563801, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "swa_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 560709, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "swa_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 560651, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + 
"average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "swa_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 524336, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "swa_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 582470, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "swa_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 437747, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "swa_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 606792, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "swa_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 538122, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + 
"max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "swa_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 543630, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "swa_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 512312, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "swa_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 540599, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "swa_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 588597, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "swa_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 363987, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + 
"swa_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 530241, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 136.45918878317477, + "max_sentence1_length": 430, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "swe_Latn-afr_Latn": { + "num_samples": 1997, + "number_of_characters": 520179, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 134.37756634952427, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "swe_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 483008, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "swe_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 495629, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "swe_Latn-dan_Latn": { + "num_samples": 1997, + "number_of_characters": 503965, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 126.25838758137206, + "max_sentence2_length": 522, + "unique_sentence2": 1995 + }, + "swe_Latn-deu_Latn": { + "num_samples": 1997, + 
"number_of_characters": 547477, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "swe_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 550954, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "swe_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 499547, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "swe_Latn-fao_Latn": { + "num_samples": 1997, + "number_of_characters": 509630, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0951427140711, + "max_sentence2_length": 433, + "unique_sentence2": 1997 + }, + "swe_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 495166, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "swe_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 521627, + "unique_pairs": 1996, + 
"min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "swe_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 544935, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "swe_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 452123, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "swe_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 513402, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "swe_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 530203, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "swe_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 538799, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 
126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "swe_Latn-isl_Latn": { + "num_samples": 1997, + "number_of_characters": 514035, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 131.30095142714072, + "max_sentence2_length": 399, + "unique_sentence2": 1996 + }, + "swe_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 363453, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "swe_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 385232, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "swe_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 511224, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "swe_Latn-ltz_Latn": { + "num_samples": 1997, + "number_of_characters": 532584, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 
1996, + "min_sentence2_length": 7, + "average_sentence2_length": 140.58938407611416, + "max_sentence2_length": 543, + "unique_sentence2": 1996 + }, + "swe_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 543742, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "swe_Latn-nno_Latn": { + "num_samples": 1997, + "number_of_characters": 500184, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.36504757135704, + "max_sentence2_length": 417, + "unique_sentence2": 1996 + }, + "swe_Latn-nob_Latn": { + "num_samples": 1997, + "number_of_characters": 503271, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.91086629944917, + "max_sentence2_length": 482, + "unique_sentence2": 1996 + }, + "swe_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 529693, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "swe_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 529042, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 
138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "swe_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 526720, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "swe_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 540027, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "swe_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 524336, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "swe_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 561788, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "swe_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 517440, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + 
"unique_sentence2": 1997 + }, + "swe_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 522948, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "swe_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 343305, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "swe_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 509559, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 126.10265398097145, + "max_sentence1_length": 430, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "tah_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 557343, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tah_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 610128, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "tah_Latn-fil_Latn": { + "num_samples": 
1997, + "number_of_characters": 603043, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + "tah_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 596595, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "tah_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 613775, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "tah_Latn-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 629931, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "tah_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 583747, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "tah_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 586806, + "unique_pairs": 1997, + 
"min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "tah_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 587478, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "tah_Latn-ton_Latn": { + "num_samples": 1997, + "number_of_characters": 623263, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 155.04406609914872, + "max_sentence1_length": 524, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 157.05558337506258, + "max_sentence2_length": 468, + "unique_sentence2": 1997 + }, + "tam_Taml-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 541142, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "tam_Taml-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 553763, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "tam_Taml-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 605611, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 
155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "tam_Taml-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 613809, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "tam_Taml-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 609088, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "tam_Taml-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 557681, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tam_Taml-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 585164, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "tam_Taml-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 553300, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + 
"unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "tam_Taml-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 579761, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "tam_Taml-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 603069, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "tam_Taml-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 554892, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "tam_Taml-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 510257, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "tam_Taml-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 571536, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + 
"average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "tam_Taml-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 588337, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "tam_Taml-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 596933, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "tam_Taml-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 421587, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "tam_Taml-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 575497, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "tam_Taml-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 443366, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + 
"max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "tam_Taml-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 569358, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "tam_Taml-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 570848, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "tam_Taml-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 558184, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "tam_Taml-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 601876, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "tam_Taml-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 560383, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + 
"tam_Taml-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 587827, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "tam_Taml-por_Latn": { + "num_samples": 1997, + "number_of_characters": 587176, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "tam_Taml-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 584854, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "tam_Taml-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 568702, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "tam_Taml-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 530288, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "tam_Taml-spa_Latn": { + "num_samples": 1997, + 
"number_of_characters": 598161, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "tam_Taml-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 582470, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "tam_Taml-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 561788, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "tam_Taml-tel_Telu": { + "num_samples": 1997, + "number_of_characters": 557488, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "tam_Taml-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 575574, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "tam_Taml-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 557959, + "unique_pairs": 1997, + 
"min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "tam_Taml-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 581082, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "tam_Taml-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 401439, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "tam_Taml-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 567693, + "unique_pairs": 1997, + "min_sentence1_length": 11, + "average_sentence1_length": 155.21331997996995, + "max_sentence1_length": 581, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "tat_Cyrl-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 515560, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 123.14772158237356, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, + "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "tat_Cyrl-bak_Cyrl": { + "num_samples": 1997, + "number_of_characters": 492252, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 
123.14772158237356, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "tat_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 493646, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 123.14772158237356, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tat_Cyrl-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 506202, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.14772158237356, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "tat_Cyrl-kir_Cyrl": { + "num_samples": 1997, + "number_of_characters": 496790, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.14772158237356, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + "max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "tat_Cyrl-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 531200, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.14772158237356, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "tat_Cyrl-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 511539, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 123.14772158237356, + "max_sentence1_length": 539, + 
"unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "tat_Cyrl-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 556948, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 123.14772158237356, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + "max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "tat_Cyrl-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 539621, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 123.14772158237356, + "max_sentence1_length": 539, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "tel_Telu-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 491329, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "tel_Telu-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 551375, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "tel_Telu-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 495247, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + 
"average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tel_Telu-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 522730, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "tel_Telu-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 492458, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "tel_Telu-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 509102, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "tel_Telu-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 513063, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "tel_Telu-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 508414, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + 
"max_sentence2_length": 443, + "unique_sentence2": 1995 + }, + "tel_Telu-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 495750, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "tel_Telu-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 497949, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "tel_Telu-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 506268, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "tel_Telu-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 467854, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "tel_Telu-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 557488, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + 
"tel_Telu-urd_Arab": { + "num_samples": 1997, + "number_of_characters": 495525, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 123.9494241362043, + "max_sentence1_length": 412, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.18527791687531, + "max_sentence2_length": 390, + "unique_sentence2": 1996 + }, + "tgk_Cyrl-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 505328, + "unique_pairs": 1995, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "tgk_Cyrl-ckb_Arab": { + "num_samples": 1997, + "number_of_characters": 526514, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 5, + "average_sentence2_length": 126.37305958938407, + "max_sentence2_length": 399, + "unique_sentence2": 1995 + }, + "tgk_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 521867, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tgk_Cyrl-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 517486, + "unique_pairs": 1995, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "tgk_Cyrl-heb_Hebr": { + "num_samples": 1997, + 
"number_of_characters": 474443, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "tgk_Cyrl-kmr_Latn": { + "num_samples": 1997, + "number_of_characters": 520093, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 7, + "average_sentence2_length": 123.15773660490736, + "max_sentence2_length": 420, + "unique_sentence2": 1996 + }, + "tgk_Cyrl-mey_Arab": { + "num_samples": 1997, + "number_of_characters": 487982, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 6, + "average_sentence2_length": 107.07811717576365, + "max_sentence2_length": 392, + "unique_sentence2": 1993 + }, + "tgk_Cyrl-prs_Arab": { + "num_samples": 1997, + "number_of_characters": 516683, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 121.45017526289435, + "max_sentence2_length": 365, + "unique_sentence2": 1997 + }, + "tgk_Cyrl-pus_Arab": { + "num_samples": 1997, + "number_of_characters": 516780, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 8, + "average_sentence2_length": 121.49874812218327, + "max_sentence2_length": 366, + "unique_sentence2": 1996 + }, + "tgk_Cyrl-shi_Arab": { + "num_samples": 1997, + "number_of_characters": 489060, + "unique_pairs": 1997, + 
"min_sentence1_length": 3, + "average_sentence1_length": 137.27941912869304, + "max_sentence1_length": 451, + "unique_sentence1": 1995, + "min_sentence2_length": 3, + "average_sentence2_length": 107.6179268903355, + "max_sentence2_length": 378, + "unique_sentence2": 1996 + }, + "tha_Thai-bod_Tibt": { + "num_samples": 1997, + "number_of_characters": 538097, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 118.91236855282925, + "max_sentence1_length": 439, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 150.54031046569855, + "max_sentence2_length": 478, + "unique_sentence2": 1993 + }, + "tha_Thai-dzo_Tibt": { + "num_samples": 1997, + "number_of_characters": 480689, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 118.91236855282925, + "max_sentence1_length": 439, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 121.79318978467701, + "max_sentence2_length": 411, + "unique_sentence2": 1992 + }, + "tha_Thai-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 485188, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 118.91236855282925, + "max_sentence1_length": 439, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tha_Thai-khm_Khmr": { + "num_samples": 1997, + "number_of_characters": 525959, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 118.91236855282925, + "max_sentence1_length": 439, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 144.4621932899349, + "max_sentence2_length": 517, + "unique_sentence2": 1996 + }, + "tha_Thai-lao_Laoo": { + "num_samples": 1997, + "number_of_characters": 504448, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 
118.91236855282925, + "max_sentence1_length": 439, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 133.69053580370556, + "max_sentence2_length": 507, + "unique_sentence2": 1997 + }, + "tha_Thai-mon_Mong": { + "num_samples": 1997, + "number_of_characters": 496516, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 118.91236855282925, + "max_sentence1_length": 439, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 129.7185778668002, + "max_sentence2_length": 414, + "unique_sentence2": 1997 + }, + "tha_Thai-mya_Mymr": { + "num_samples": 1997, + "number_of_characters": 549322, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 118.91236855282925, + "max_sentence1_length": 439, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 156.16124186279418, + "max_sentence2_length": 773, + "unique_sentence2": 1997 + }, + "tir_Ethi-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 332745, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "tir_Ethi-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 412958, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tir_Ethi-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 435204, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, 
+ "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "tir_Ethi-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 411339, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "tir_Ethi-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 456737, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "tir_Ethi-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 402669, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "tir_Ethi-som_Latn": { + "num_samples": 1997, + "number_of_characters": 456530, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "tir_Ethi-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 453380, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + 
"max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "tir_Ethi-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 437747, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "tir_Ethi-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 499521, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "tir_Ethi-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 405041, + "unique_pairs": 1996, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "tir_Ethi-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 433328, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "tir_Ethi-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 481326, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "tir_Ethi-zul_Latn": { 
+ "num_samples": 1997, + "number_of_characters": 422970, + "unique_pairs": 1997, + "min_sentence1_length": 5, + "average_sentence1_length": 82.743114672008, + "max_sentence1_length": 272, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "ton_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 561360, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ton_Latn-fij_Latn": { + "num_samples": 1997, + "number_of_characters": 614145, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 150.478217325989, + "max_sentence2_length": 448, + "unique_sentence2": 1988 + }, + "ton_Latn-fil_Latn": { + "num_samples": 1997, + "number_of_characters": 607060, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 146.93039559339007, + "max_sentence2_length": 554, + "unique_sentence2": 1997 + }, + "ton_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 600612, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "ton_Latn-mal_Mlym": { + "num_samples": 1997, + "number_of_characters": 617792, + 
"unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 152.30445668502753, + "max_sentence2_length": 540, + "unique_sentence2": 1996 + }, + "ton_Latn-mlg_Latn": { + "num_samples": 1997, + "number_of_characters": 633948, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 160.39459188783175, + "max_sentence2_length": 559, + "unique_sentence2": 1994 + }, + "ton_Latn-mri_Latn": { + "num_samples": 1997, + "number_of_characters": 587764, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 137.26790185277918, + "max_sentence2_length": 443, + "unique_sentence2": 1997 + }, + "ton_Latn-msa_Latn": { + "num_samples": 1997, + "number_of_characters": 590823, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 138.79969954932398, + "max_sentence2_length": 463, + "unique_sentence2": 1997 + }, + "ton_Latn-smo_Latn": { + "num_samples": 1997, + "number_of_characters": 591495, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 139.1362043064597, + "max_sentence2_length": 431, + "unique_sentence2": 1996 + }, + "ton_Latn-tah_Latn": { + "num_samples": 1997, + "number_of_characters": 623263, + "unique_pairs": 1997, + "min_sentence1_length": 7, + 
"average_sentence1_length": 157.05558337506258, + "max_sentence1_length": 468, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 155.04406609914872, + "max_sentence2_length": 524, + "unique_sentence2": 1997 + }, + "tsn_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 501790, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "tsn_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 582003, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tsn_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 604249, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "tsn_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 580384, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "tsn_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 625782, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + 
"max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "tsn_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 571714, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "tsn_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 625575, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "tsn_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 622425, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "tsn_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 606792, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "tsn_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 499521, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + 
"min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "tsn_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 574086, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "tsn_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 602373, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "tsn_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 650371, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "tsn_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 592015, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 167.39258888332498, + "max_sentence1_length": 556, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "tuk_Latn-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 554908, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, 
+ "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "tuk_Latn-bak_Cyrl": { + "num_samples": 1997, + "number_of_characters": 531600, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "tuk_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 532994, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tuk_Latn-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 545550, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "tuk_Latn-kir_Cyrl": { + "num_samples": 1997, + "number_of_characters": 536138, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + "max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "tuk_Latn-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 531200, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 123.14772158237356, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + 
"tuk_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 550887, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "tuk_Latn-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 596296, + "unique_pairs": 1997, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + "max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "tuk_Latn-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 578969, + "unique_pairs": 1996, + "min_sentence1_length": 9, + "average_sentence1_length": 142.85127691537306, + "max_sentence1_length": 576, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "tur_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 496794, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "tur_Latn-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 535247, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, + "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "tur_Latn-bak_Cyrl": { + "num_samples": 1997, + 
"number_of_characters": 511939, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "tur_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 509415, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "tur_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 561263, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "tur_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 564740, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "tur_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 513333, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "tur_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 508952, + "unique_pairs": 1997, + 
"min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "tur_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 535413, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "tur_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 558721, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "tur_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 465909, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "tur_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 527188, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "tur_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 543989, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 
133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "tur_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 552585, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "tur_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 377239, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "tur_Latn-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 525889, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "tur_Latn-kir_Cyrl": { + "num_samples": 1997, + "number_of_characters": 516477, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + "max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "tur_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 399018, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + 
"unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "tur_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 525010, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "tur_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 557528, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "tur_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 543479, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "tur_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 542828, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "tur_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 540506, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + 
"average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "tur_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 553813, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "tur_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 538122, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "tur_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 517440, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "tur_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 575574, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "tur_Latn-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 511539, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 123.14772158237356, + 
"max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "tur_Latn-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 550887, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "tur_Latn-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 576635, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + "max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "tur_Latn-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 559308, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "tur_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 536734, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "tur_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 357091, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + 
"tur_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 523345, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 133.00600901352027, + "max_sentence1_length": 504, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "uig_Arab-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 580656, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, + "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "uig_Arab-bak_Cyrl": { + "num_samples": 1997, + "number_of_characters": 557348, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "uig_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 558742, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "uig_Arab-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 571298, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "uig_Arab-kir_Cyrl": { + "num_samples": 1997, + 
"number_of_characters": 561886, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + "max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "uig_Arab-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 556948, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 123.14772158237356, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "uig_Arab-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 596296, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "uig_Arab-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 576635, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "uig_Arab-uzb_Latn": { + "num_samples": 1997, + "number_of_characters": 604717, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 155.74461692538807, + "max_sentence1_length": 592, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 147.06810215322986, + "max_sentence2_length": 470, + "unique_sentence2": 1996 + }, + "ukr_Cyrl-bel_Cyrl": { + "num_samples": 1997, + "number_of_characters": 518873, + "unique_pairs": 1996, + 
"min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.2373560340511, + "max_sentence2_length": 422, + "unique_sentence2": 1996 + }, + "ukr_Cyrl-bos_Latn": { + "num_samples": 1997, + "number_of_characters": 517693, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 127.64646970455684, + "max_sentence2_length": 434, + "unique_sentence2": 1996 + }, + "ukr_Cyrl-bul_Cyrl": { + "num_samples": 1997, + "number_of_characters": 532672, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 135.14722083124687, + "max_sentence2_length": 493, + "unique_sentence2": 1996 + }, + "ukr_Cyrl-ces_Latn": { + "num_samples": 1997, + "number_of_characters": 504101, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 120.84026039058588, + "max_sentence2_length": 474, + "unique_sentence2": 1997 + }, + "ukr_Cyrl-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 510503, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ukr_Cyrl-hrv_Latn": { + "num_samples": 1997, + "number_of_characters": 518708, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 
131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 128.1547320981472, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "ukr_Cyrl-mkd_Cyrl": { + "num_samples": 1997, + "number_of_characters": 530674, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.1467200801202, + "max_sentence2_length": 451, + "unique_sentence2": 1997 + }, + "ukr_Cyrl-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 540649, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "ukr_Cyrl-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 537676, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "ukr_Cyrl-slk_Latn": { + "num_samples": 1997, + "number_of_characters": 515752, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 126.67451176765148, + "max_sentence2_length": 403, + "unique_sentence2": 1996 + }, + "ukr_Cyrl-slv_Latn": { + "num_samples": 1997, + "number_of_characters": 515679, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 
1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.63795693540311, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "ukr_Cyrl-srp_Cyrl": { + "num_samples": 1997, + "number_of_characters": 515086, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 126.34101151727592, + "max_sentence2_length": 439, + "unique_sentence2": 1995 + }, + "ukr_Cyrl-srp_Latn": { + "num_samples": 1997, + "number_of_characters": 518924, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 131.58888332498748, + "max_sentence1_length": 440, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 128.26289434151226, + "max_sentence2_length": 452, + "unique_sentence2": 1996 + }, + "urd_Arab-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 491800, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "urd_Arab-div_Thaa": { + "num_samples": 1997, + "number_of_characters": 551846, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 152.15222834251378, + "max_sentence2_length": 609, + "unique_sentence2": 1996 + }, + "urd_Arab-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 495718, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 
124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "urd_Arab-eus_Latn": { + "num_samples": 1997, + "number_of_characters": 523201, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 137.80821231847773, + "max_sentence2_length": 393, + "unique_sentence2": 1997 + }, + "urd_Arab-guj_Gujr": { + "num_samples": 1997, + "number_of_characters": 492929, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 122.64947421131697, + "max_sentence2_length": 378, + "unique_sentence2": 1997 + }, + "urd_Arab-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 509573, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "urd_Arab-kan_Knda": { + "num_samples": 1997, + "number_of_characters": 513534, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 132.96745117676514, + "max_sentence2_length": 449, + "unique_sentence2": 1996 + }, + "urd_Arab-mar_Deva": { + "num_samples": 1997, + "number_of_characters": 508885, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 130.63945918878318, + "max_sentence2_length": 443, + 
"unique_sentence2": 1995 + }, + "urd_Arab-nep_Deva": { + "num_samples": 1997, + "number_of_characters": 496221, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 124.29794692038057, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "urd_Arab-pan_Guru": { + "num_samples": 1997, + "number_of_characters": 498420, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 125.39909864797195, + "max_sentence2_length": 383, + "unique_sentence2": 1996 + }, + "urd_Arab-sin_Sinh": { + "num_samples": 1997, + "number_of_characters": 506739, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 129.56484727090637, + "max_sentence2_length": 441, + "unique_sentence2": 1996 + }, + "urd_Arab-snd_Arab": { + "num_samples": 1997, + "number_of_characters": 468325, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 110.32899349023535, + "max_sentence2_length": 335, + "unique_sentence2": 1996 + }, + "urd_Arab-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 557959, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "urd_Arab-tel_Telu": { + 
"num_samples": 1997, + "number_of_characters": 495525, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 124.18527791687531, + "max_sentence1_length": 390, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 123.9494241362043, + "max_sentence2_length": 412, + "unique_sentence2": 1996 + }, + "uzb_Latn-aze_Latn": { + "num_samples": 1997, + "number_of_characters": 563329, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 135.0195292939409, + "max_sentence2_length": 398, + "unique_sentence2": 1997 + }, + "uzb_Latn-bak_Cyrl": { + "num_samples": 1997, + "number_of_characters": 540021, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 123.34802203304957, + "max_sentence2_length": 437, + "unique_sentence2": 1995 + }, + "uzb_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 541415, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "uzb_Latn-kaz_Cyrl": { + "num_samples": 1997, + "number_of_characters": 553971, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 130.33350025037555, + "max_sentence2_length": 473, + "unique_sentence2": 1996 + }, + "uzb_Latn-kir_Cyrl": { + "num_samples": 1997, + "number_of_characters": 544559, + 
"unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 125.62043064596895, + "max_sentence2_length": 395, + "unique_sentence2": 1996 + }, + "uzb_Latn-tat_Cyrl": { + "num_samples": 1997, + "number_of_characters": 539621, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 123.14772158237356, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "uzb_Latn-tuk_Latn": { + "num_samples": 1997, + "number_of_characters": 578969, + "unique_pairs": 1996, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 142.85127691537306, + "max_sentence2_length": 576, + "unique_sentence2": 1996 + }, + "uzb_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 559308, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "uzb_Latn-uig_Arab": { + "num_samples": 1997, + "number_of_characters": 604717, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 147.06810215322986, + "max_sentence1_length": 470, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 155.74461692538807, + "max_sentence2_length": 592, + "unique_sentence2": 1996 + }, + "ven_Latn-bem_Latn": { + "num_samples": 1997, + "number_of_characters": 598248, + "unique_pairs": 1997, + "min_sentence1_length": 10, + 
"average_sentence1_length": 150.10315473209815, + "max_sentence1_length": 535, + "unique_sentence1": 1993, + "min_sentence2_length": 8, + "average_sentence2_length": 149.47020530796195, + "max_sentence2_length": 465, + "unique_sentence2": 1997 + }, + "ven_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 547476, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.10315473209815, + "max_sentence1_length": 535, + "unique_sentence1": 1993, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "ven_Latn-ewe_Latn": { + "num_samples": 1997, + "number_of_characters": 538734, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 150.10315473209815, + "max_sentence1_length": 535, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + "average_sentence2_length": 119.6685027541312, + "max_sentence2_length": 493, + "unique_sentence2": 1994 + }, + "ven_Latn-fuc_Latn": { + "num_samples": 1997, + "number_of_characters": 528236, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.10315473209815, + "max_sentence1_length": 535, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + "average_sentence2_length": 114.4116174261392, + "max_sentence2_length": 376, + "unique_sentence2": 1996 + }, + "ven_Latn-kin_Latn": { + "num_samples": 1997, + "number_of_characters": 603543, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 150.10315473209815, + "max_sentence1_length": 535, + "unique_sentence1": 1993, + "min_sentence2_length": 10, + "average_sentence2_length": 152.12168252378567, + "max_sentence2_length": 541, + "unique_sentence2": 1996 + }, + "ven_Latn-nde_Latn": { + "num_samples": 1997, + "number_of_characters": 597495, + "unique_pairs": 1997, + "min_sentence1_length": 10, + "average_sentence1_length": 150.10315473209815, + 
"max_sentence1_length": 535, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + "average_sentence2_length": 149.09313970956435, + "max_sentence2_length": 590, + "unique_sentence2": 1997 + }, + "ven_Latn-nya_Latn": { + "num_samples": 1997, + "number_of_characters": 584038, + "unique_pairs": 1996, + "min_sentence1_length": 10, + "average_sentence1_length": 150.10315473209815, + "max_sentence1_length": 535, + "unique_sentence1": 1993, + "min_sentence2_length": 10, + "average_sentence2_length": 142.35453179769655, + "max_sentence2_length": 464, + "unique_sentence2": 1993 + }, + "ven_Latn-sna_Latn": { + "num_samples": 1997, + "number_of_characters": 598086, + "unique_pairs": 1995, + "min_sentence1_length": 10, + "average_sentence1_length": 150.10315473209815, + "max_sentence1_length": 535, + "unique_sentence1": 1993, + "min_sentence2_length": 6, + "average_sentence2_length": 149.38908362543816, + "max_sentence2_length": 511, + "unique_sentence2": 1995 + }, + "vie_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 502302, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "vie_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 514923, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "vie_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 566771, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + 
"min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "vie_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 570248, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "vie_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 518841, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "vie_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 514460, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "vie_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 540921, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "vie_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 564229, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + 
"max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "vie_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 471417, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "vie_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 532696, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "vie_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 549497, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "vie_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 558093, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "vie_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 382747, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + 
"vie_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 404526, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "vie_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 530518, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "vie_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 563036, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "vie_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 548987, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "vie_Latn-por_Latn": { + "num_samples": 1997, + "number_of_characters": 548336, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "vie_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 546014, + 
"unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "vie_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 559321, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "vie_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 543630, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "vie_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 522948, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "vie_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 581082, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "vie_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 536734, + "unique_pairs": 1997, + "min_sentence1_length": 7, + 
"average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "vie_Latn-yue_Hant": { + "num_samples": 1997, + "number_of_characters": 350008, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 39.502754131196795, + "max_sentence2_length": 133, + "unique_sentence2": 1996 + }, + "vie_Latn-zho_Hans": { + "num_samples": 1997, + "number_of_characters": 356082, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 42.54431647471207, + "max_sentence2_length": 263, + "unique_sentence2": 1997 + }, + "vie_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 362599, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "vie_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 528853, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 135.764146219329, + "max_sentence1_length": 437, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "wol_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 407310, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + 
"unique_sentence1": 1990, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "wol_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 487523, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "wol_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 509769, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "wol_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 485904, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "wol_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 531302, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "wol_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 477234, + "unique_pairs": 1992, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 9, + 
"average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "wol_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 531095, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "wol_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 527945, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "wol_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 512312, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "wol_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 405041, + "unique_pairs": 1996, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "wol_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 574086, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + 
"max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "wol_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 507893, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "wol_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 555891, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "wol_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 497535, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 120.08162243365048, + "max_sentence1_length": 405, + "unique_sentence1": 1990, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "xho_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 435597, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "xho_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 515810, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + 
"xho_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 538056, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "xho_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 514191, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "xho_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 559589, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "xho_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 505521, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "xho_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 559382, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "xho_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 
556232, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "xho_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 540599, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "xho_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 433328, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "xho_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 602373, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "xho_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 507893, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "xho_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 584178, + "unique_pairs": 1997, + "min_sentence1_length": 6, + 
"average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "xho_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 525822, + "unique_pairs": 1997, + "min_sentence1_length": 6, + "average_sentence1_length": 134.2463695543315, + "max_sentence1_length": 492, + "unique_sentence1": 1997, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "yor_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 483595, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "yor_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 563808, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "yor_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 586054, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "yor_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 562189, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + 
"unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "yor_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 607587, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "yor_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 553519, + "unique_pairs": 1996, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "yor_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 607380, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "yor_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 604230, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "yor_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 588597, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + 
"average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "yor_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 481326, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "yor_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 650371, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "yor_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 555891, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "yor_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 584178, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "yor_Latn-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 573820, + "unique_pairs": 1997, + "min_sentence1_length": 7, + "average_sentence1_length": 158.2814221331998, + "max_sentence1_length": 582, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, 
+ "unique_sentence2": 1996 + }, + "yue_Hant-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 326607, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 39.502754131196795, + "max_sentence1_length": 133, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "yue_Hant-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 190513, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 39.502754131196795, + "max_sentence1_length": 133, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "yue_Hant-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 212292, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 39.502754131196795, + "max_sentence1_length": 133, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "yue_Hant-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 350008, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 39.502754131196795, + "max_sentence1_length": 133, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "yue_Hant-zho_Hans": { + "num_samples": 1997, + "number_of_characters": 163848, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 39.502754131196795, + "max_sentence1_length": 133, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 42.54431647471207, + "max_sentence2_length": 263, + "unique_sentence2": 1997 + }, + "yue_Hant-zho_Hant": { + "num_samples": 
1997, + "number_of_characters": 170365, + "unique_pairs": 1996, + "min_sentence1_length": 4, + "average_sentence1_length": 39.502754131196795, + "max_sentence1_length": 133, + "unique_sentence1": 1996, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "zho_Hans-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 332681, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 42.54431647471207, + "max_sentence1_length": 263, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "zho_Hans-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 196587, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 42.54431647471207, + "max_sentence1_length": 263, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "zho_Hans-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 218366, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 42.54431647471207, + "max_sentence1_length": 263, + "unique_sentence1": 1997, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "zho_Hans-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 356082, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 42.54431647471207, + "max_sentence1_length": 263, + "unique_sentence1": 1997, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "zho_Hans-yue_Hant": { + "num_samples": 1997, + "number_of_characters": 163848, + "unique_pairs": 1997, + 
"min_sentence1_length": 4, + "average_sentence1_length": 42.54431647471207, + "max_sentence1_length": 263, + "unique_sentence1": 1997, + "min_sentence2_length": 4, + "average_sentence2_length": 39.502754131196795, + "max_sentence2_length": 133, + "unique_sentence2": 1996 + }, + "zho_Hans-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 176439, + "unique_pairs": 1997, + "min_sentence1_length": 4, + "average_sentence1_length": 42.54431647471207, + "max_sentence1_length": 263, + "unique_sentence1": 1997, + "min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + }, + "zho_Hant-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 322659, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "zho_Hant-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 335280, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "zho_Hant-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 387128, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "zho_Hant-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 390605, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, 
+ "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "zho_Hant-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 339198, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "zho_Hant-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 334817, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "zho_Hant-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 361278, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "zho_Hant-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 384586, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "zho_Hant-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 291774, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + 
"min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "zho_Hant-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 353053, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "zho_Hant-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 369854, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "zho_Hant-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 378450, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "zho_Hant-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 203104, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "zho_Hant-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 224883, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + 
"max_sentence2_length": 217, + "unique_sentence2": 1995 + }, + "zho_Hant-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 350875, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "zho_Hant-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 383393, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "zho_Hant-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 369344, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "zho_Hant-por_Latn": { + "num_samples": 1997, + "number_of_characters": 368693, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "zho_Hant-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 366371, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + 
"zho_Hant-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 379678, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "zho_Hant-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 363987, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "zho_Hant-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 343305, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "zho_Hant-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 401439, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "zho_Hant-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 357091, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "zho_Hant-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 
362599, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "zho_Hant-yue_Hant": { + "num_samples": 1997, + "number_of_characters": 170365, + "unique_pairs": 1996, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 39.502754131196795, + "max_sentence2_length": 133, + "unique_sentence2": 1996 + }, + "zho_Hant-zho_Hans": { + "num_samples": 1997, + "number_of_characters": 176439, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 42.54431647471207, + "max_sentence2_length": 263, + "unique_sentence2": 1997 + }, + "zho_Hant-zul_Latn": { + "num_samples": 1997, + "number_of_characters": 349210, + "unique_pairs": 1997, + "min_sentence1_length": 3, + "average_sentence1_length": 45.80771156735103, + "max_sentence1_length": 200, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 129.0595893840761, + "max_sentence2_length": 494, + "unique_sentence2": 1996 + }, + "zul_Latn-amh_Ethi": { + "num_samples": 1997, + "number_of_characters": 425239, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 83.87931897846771, + "max_sentence2_length": 290, + "unique_sentence2": 1994 + }, + "zul_Latn-arb_Arab": { + "num_samples": 1997, + "number_of_characters": 488913, + "unique_pairs": 1997, + "min_sentence1_length": 8, + 
"average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 115.76414621932899, + "max_sentence2_length": 362, + "unique_sentence2": 1995 + }, + "zul_Latn-ben_Beng": { + "num_samples": 1997, + "number_of_characters": 501534, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 122.08412618928392, + "max_sentence2_length": 402, + "unique_sentence2": 1997 + }, + "zul_Latn-deu_Latn": { + "num_samples": 1997, + "number_of_characters": 553382, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 148.04707060590886, + "max_sentence2_length": 508, + "unique_sentence2": 1996 + }, + "zul_Latn-ell_Grek": { + "num_samples": 1997, + "number_of_characters": 556859, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 149.78818227341011, + "max_sentence2_length": 584, + "unique_sentence2": 1996 + }, + "zul_Latn-eng_Latn": { + "num_samples": 1997, + "number_of_characters": 505452, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 124.04606910365548, + "max_sentence2_length": 437, + "unique_sentence2": 1997 + }, + "zul_Latn-fas_Arab": { + "num_samples": 1997, + "number_of_characters": 501071, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 
494, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 121.85227841762644, + "max_sentence2_length": 389, + "unique_sentence2": 1995 + }, + "zul_Latn-fin_Latn": { + "num_samples": 1997, + "number_of_characters": 527532, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.10265398097147, + "max_sentence2_length": 463, + "unique_sentence2": 1996 + }, + "zul_Latn-fra_Latn": { + "num_samples": 1997, + "number_of_characters": 550840, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.7741612418628, + "max_sentence2_length": 512, + "unique_sentence2": 1996 + }, + "zul_Latn-hau_Latn": { + "num_samples": 1997, + "number_of_characters": 527698, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 135.185778668002, + "max_sentence2_length": 483, + "unique_sentence2": 1997 + }, + "zul_Latn-heb_Hebr": { + "num_samples": 1997, + "number_of_characters": 458028, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 100.29844767150726, + "max_sentence2_length": 375, + "unique_sentence2": 1996 + }, + "zul_Latn-hin_Deva": { + "num_samples": 1997, + "number_of_characters": 519307, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + 
"average_sentence2_length": 130.9839759639459, + "max_sentence2_length": 394, + "unique_sentence2": 1996 + }, + "zul_Latn-hun_Latn": { + "num_samples": 1997, + "number_of_characters": 536108, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 139.3970956434652, + "max_sentence2_length": 508, + "unique_sentence2": 1997 + }, + "zul_Latn-ibo_Latn": { + "num_samples": 1997, + "number_of_characters": 503833, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 123.23535302954431, + "max_sentence2_length": 469, + "unique_sentence2": 1997 + }, + "zul_Latn-ind_Latn": { + "num_samples": 1997, + "number_of_characters": 544704, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 143.70155232849274, + "max_sentence2_length": 486, + "unique_sentence2": 1997 + }, + "zul_Latn-jpn_Jpan": { + "num_samples": 1997, + "number_of_characters": 369358, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 4, + "average_sentence2_length": 55.89684526790185, + "max_sentence2_length": 189, + "unique_sentence2": 1994 + }, + "zul_Latn-kor_Hang": { + "num_samples": 1997, + "number_of_characters": 391137, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 66.80270405608412, + "max_sentence2_length": 217, 
+ "unique_sentence2": 1995 + }, + "zul_Latn-lit_Latn": { + "num_samples": 1997, + "number_of_characters": 517129, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 129.893340010015, + "max_sentence2_length": 446, + "unique_sentence2": 1995 + }, + "zul_Latn-nld_Latn": { + "num_samples": 1997, + "number_of_characters": 549647, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 146.1767651477216, + "max_sentence2_length": 539, + "unique_sentence2": 1996 + }, + "zul_Latn-nso_Latn": { + "num_samples": 1997, + "number_of_characters": 549231, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 145.96845267901853, + "max_sentence2_length": 487, + "unique_sentence2": 1996 + }, + "zul_Latn-orm_Ethi": { + "num_samples": 1997, + "number_of_characters": 495163, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 118.89384076114172, + "max_sentence2_length": 466, + "unique_sentence2": 1984 + }, + "zul_Latn-pol_Latn": { + "num_samples": 1997, + "number_of_characters": 535598, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 9, + "average_sentence2_length": 139.14171256885328, + "max_sentence2_length": 468, + "unique_sentence2": 1996 + }, + "zul_Latn-por_Latn": { + "num_samples": 
1997, + "number_of_characters": 534947, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 138.81572358537807, + "max_sentence2_length": 497, + "unique_sentence2": 1996 + }, + "zul_Latn-rus_Cyrl": { + "num_samples": 1997, + "number_of_characters": 532625, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 137.6529794692038, + "max_sentence2_length": 419, + "unique_sentence2": 1996 + }, + "zul_Latn-som_Latn": { + "num_samples": 1997, + "number_of_characters": 549024, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 145.8647971957937, + "max_sentence2_length": 455, + "unique_sentence2": 1997 + }, + "zul_Latn-spa_Latn": { + "num_samples": 1997, + "number_of_characters": 545932, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 1, + "average_sentence2_length": 144.3164747120681, + "max_sentence2_length": 504, + "unique_sentence2": 1996 + }, + "zul_Latn-ssw_Latn": { + "num_samples": 1997, + "number_of_characters": 545874, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 144.28743114672008, + "max_sentence2_length": 510, + "unique_sentence2": 1996 + }, + "zul_Latn-swa_Latn": { + "num_samples": 1997, + "number_of_characters": 530241, + "unique_pairs": 1997, + 
"min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 10, + "average_sentence2_length": 136.45918878317477, + "max_sentence2_length": 430, + "unique_sentence2": 1997 + }, + "zul_Latn-swe_Latn": { + "num_samples": 1997, + "number_of_characters": 509559, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 8, + "average_sentence2_length": 126.10265398097145, + "max_sentence2_length": 430, + "unique_sentence2": 1996 + }, + "zul_Latn-tam_Taml": { + "num_samples": 1997, + "number_of_characters": 567693, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 11, + "average_sentence2_length": 155.21331997996995, + "max_sentence2_length": 581, + "unique_sentence2": 1997 + }, + "zul_Latn-tir_Ethi": { + "num_samples": 1997, + "number_of_characters": 422970, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 5, + "average_sentence2_length": 82.743114672008, + "max_sentence2_length": 272, + "unique_sentence2": 1996 + }, + "zul_Latn-tsn_Latn": { + "num_samples": 1997, + "number_of_characters": 592015, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 167.39258888332498, + "max_sentence2_length": 556, + "unique_sentence2": 1997 + }, + "zul_Latn-tur_Latn": { + "num_samples": 1997, + "number_of_characters": 523345, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, 
+ "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 133.00600901352027, + "max_sentence2_length": 504, + "unique_sentence2": 1997 + }, + "zul_Latn-vie_Latn": { + "num_samples": 1997, + "number_of_characters": 528853, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 135.764146219329, + "max_sentence2_length": 437, + "unique_sentence2": 1996 + }, + "zul_Latn-wol_Latn": { + "num_samples": 1997, + "number_of_characters": 497535, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 120.08162243365048, + "max_sentence2_length": 405, + "unique_sentence2": 1990 + }, + "zul_Latn-xho_Latn": { + "num_samples": 1997, + "number_of_characters": 525822, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 6, + "average_sentence2_length": 134.2463695543315, + "max_sentence2_length": 492, + "unique_sentence2": 1997 + }, + "zul_Latn-yor_Latn": { + "num_samples": 1997, + "number_of_characters": 573820, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + "min_sentence2_length": 7, + "average_sentence2_length": 158.2814221331998, + "max_sentence2_length": 582, + "unique_sentence2": 1996 + }, + "zul_Latn-zho_Hant": { + "num_samples": 1997, + "number_of_characters": 349210, + "unique_pairs": 1997, + "min_sentence1_length": 8, + "average_sentence1_length": 129.0595893840761, + "max_sentence1_length": 494, + "unique_sentence1": 1996, + 
"min_sentence2_length": 3, + "average_sentence2_length": 45.80771156735103, + "max_sentence2_length": 200, + "unique_sentence2": 1996 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/NollySentiBitextMining.json b/mteb/descriptive_stats/BitextMining/NollySentiBitextMining.json new file mode 100644 index 000000000..754f13c76 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/NollySentiBitextMining.json @@ -0,0 +1,69 @@ +{ + "train": { + "num_samples": 1640, + "number_of_characters": 445805, + "unique_pairs": 1632, + "min_sentence1_length": 3, + "average_sentence1_length": 136.3170731707317, + "max_sentence1_length": 1698, + "unique_sentence1": 405, + "min_sentence2_length": 3, + "average_sentence2_length": 135.515243902439, + "max_sentence2_length": 1728, + "unique_sentence2": 1631, + "hf_subset_descriptive_stats": { + "en-ha": { + "num_samples": 410, + "number_of_characters": 115348, + "unique_pairs": 407, + "min_sentence1_length": 3, + "average_sentence1_length": 136.3170731707317, + "max_sentence1_length": 1698, + "unique_sentence1": 405, + "min_sentence2_length": 4, + "average_sentence2_length": 145.01951219512196, + "max_sentence2_length": 1728, + "unique_sentence2": 407 + }, + "en-ig": { + "num_samples": 410, + "number_of_characters": 107173, + "unique_pairs": 409, + "min_sentence1_length": 3, + "average_sentence1_length": 136.3170731707317, + "max_sentence1_length": 1698, + "unique_sentence1": 405, + "min_sentence2_length": 5, + "average_sentence2_length": 125.08048780487805, + "max_sentence2_length": 1137, + "unique_sentence2": 408 + }, + "en-pcm": { + "num_samples": 410, + "number_of_characters": 109955, + "unique_pairs": 408, + "min_sentence1_length": 3, + "average_sentence1_length": 136.3170731707317, + "max_sentence1_length": 1698, + "unique_sentence1": 405, + "min_sentence2_length": 3, + "average_sentence2_length": 131.8658536585366, + "max_sentence2_length": 1552, + "unique_sentence2": 408 + }, + "en-yo": 
{ + "num_samples": 410, + "number_of_characters": 113329, + "unique_pairs": 409, + "min_sentence1_length": 3, + "average_sentence1_length": 136.3170731707317, + "max_sentence1_length": 1698, + "unique_sentence1": 405, + "min_sentence2_length": 6, + "average_sentence2_length": 140.0951219512195, + "max_sentence2_length": 1338, + "unique_sentence2": 409 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/NorwegianCourtsBitextMining.json b/mteb/descriptive_stats/BitextMining/NorwegianCourtsBitextMining.json new file mode 100644 index 000000000..96403e4c8 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/NorwegianCourtsBitextMining.json @@ -0,0 +1,15 @@ +{ + "test": { + "num_samples": 228, + "number_of_characters": 37441, + "unique_pairs": 228, + "min_sentence1_length": 13, + "average_sentence1_length": 82.19736842105263, + "max_sentence1_length": 272, + "unique_sentence1": 227, + "min_sentence2_length": 10, + "average_sentence2_length": 82.01754385964912, + "max_sentence2_length": 269, + "unique_sentence2": 226 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/NusaTranslationBitextMining.json b/mteb/descriptive_stats/BitextMining/NusaTranslationBitextMining.json index 60a8e055c..9efdf2f8d 100644 --- a/mteb/descriptive_stats/BitextMining/NusaTranslationBitextMining.json +++ b/mteb/descriptive_stats/BitextMining/NusaTranslationBitextMining.json @@ -1,75 +1,159 @@ { "train": { - "average_sentence1_length": 145.4552390438247, - "average_sentence2_length": 148.56607569721115, "num_samples": 50200, "number_of_characters": 14759870, + "unique_pairs": 50140, + "min_sentence1_length": 5, + "average_sentence1_length": 145.4552390438247, + "max_sentence1_length": 873, + "unique_sentence1": 8258, + "min_sentence2_length": 5, + "average_sentence2_length": 148.56607569721115, + "max_sentence2_length": 980, + "unique_sentence2": 50102, "hf_subset_descriptive_stats": { "ind-abs": { + "num_samples": 1000, 
+ "number_of_characters": 295680, + "unique_pairs": 999, + "min_sentence1_length": 5, "average_sentence1_length": 148.366, + "max_sentence1_length": 727, + "unique_sentence1": 998, + "min_sentence2_length": 6, "average_sentence2_length": 147.314, - "num_samples": 1000, - "number_of_characters": 295680 + "max_sentence2_length": 629, + "unique_sentence2": 998 }, "ind-btk": { + "num_samples": 6600, + "number_of_characters": 1927907, + "unique_pairs": 6597, + "min_sentence1_length": 5, "average_sentence1_length": 145.36666666666667, + "max_sentence1_length": 873, + "unique_sentence1": 6521, + "min_sentence2_length": 5, "average_sentence2_length": 146.74045454545455, - "num_samples": 6600, - "number_of_characters": 1927907 + "max_sentence2_length": 980, + "unique_sentence2": 6596 }, "ind-bew": { + "num_samples": 6600, + "number_of_characters": 1939300, + "unique_pairs": 6595, + "min_sentence1_length": 5, "average_sentence1_length": 145.4280303030303, + "max_sentence1_length": 873, + "unique_sentence1": 6512, + "min_sentence2_length": 6, "average_sentence2_length": 148.40530303030303, - "num_samples": 6600, - "number_of_characters": 1939300 + "max_sentence2_length": 840, + "unique_sentence2": 6590 }, "ind-bhp": { + "num_samples": 1000, + "number_of_characters": 261666, + "unique_pairs": 1000, + "min_sentence1_length": 11, "average_sentence1_length": 133.528, + "max_sentence1_length": 468, + "unique_sentence1": 999, + "min_sentence2_length": 10, "average_sentence2_length": 128.138, - "num_samples": 1000, - "number_of_characters": 261666 + "max_sentence2_length": 459, + "unique_sentence2": 999 }, "ind-jav": { + "num_samples": 6600, + "number_of_characters": 1922162, + "unique_pairs": 6594, + "min_sentence1_length": 5, "average_sentence1_length": 145.42772727272728, + "max_sentence1_length": 873, + "unique_sentence1": 6512, + "min_sentence2_length": 5, "average_sentence2_length": 145.8089393939394, - "num_samples": 6600, - "number_of_characters": 1922162 + 
"max_sentence2_length": 854, + "unique_sentence2": 6585 }, "ind-mad": { + "num_samples": 6600, + "number_of_characters": 1973257, + "unique_pairs": 6598, + "min_sentence1_length": 5, "average_sentence1_length": 145.35545454545453, + "max_sentence1_length": 873, + "unique_sentence1": 6521, + "min_sentence2_length": 5, "average_sentence2_length": 153.6228787878788, - "num_samples": 6600, - "number_of_characters": 1973257 + "max_sentence2_length": 827, + "unique_sentence2": 6592 }, "ind-mak": { + "num_samples": 6600, + "number_of_characters": 1953868, + "unique_pairs": 6594, + "min_sentence1_length": 5, "average_sentence1_length": 145.42772727272728, + "max_sentence1_length": 873, + "unique_sentence1": 6512, + "min_sentence2_length": 6, "average_sentence2_length": 150.6128787878788, - "num_samples": 6600, - "number_of_characters": 1953868 + "max_sentence2_length": 888, + "unique_sentence2": 6586 }, "ind-min": { + "num_samples": 6600, + "number_of_characters": 1937033, + "unique_pairs": 6595, + "min_sentence1_length": 5, "average_sentence1_length": 145.42772727272728, + "max_sentence1_length": 873, + "unique_sentence1": 6512, + "min_sentence2_length": 6, "average_sentence2_length": 148.0621212121212, - "num_samples": 6600, - "number_of_characters": 1937033 + "max_sentence2_length": 837, + "unique_sentence2": 6591 }, "ind-mui": { + "num_samples": 1000, + "number_of_characters": 301448, + "unique_pairs": 1000, + "min_sentence1_length": 11, "average_sentence1_length": 150.454, + "max_sentence1_length": 451, + "unique_sentence1": 997, + "min_sentence2_length": 11, "average_sentence2_length": 150.994, - "num_samples": 1000, - "number_of_characters": 301448 + "max_sentence2_length": 450, + "unique_sentence2": 1000 }, "ind-rej": { + "num_samples": 1000, + "number_of_characters": 291205, + "unique_pairs": 1000, + "min_sentence1_length": 9, "average_sentence1_length": 151.622, + "max_sentence1_length": 873, + "unique_sentence1": 998, + "min_sentence2_length": 8, 
"average_sentence2_length": 139.583, - "num_samples": 1000, - "number_of_characters": 291205 + "max_sentence2_length": 784, + "unique_sentence2": 1000 }, "ind-sun": { + "num_samples": 6600, + "number_of_characters": 1956344, + "unique_pairs": 6591, + "min_sentence1_length": 5, "average_sentence1_length": 145.42772727272728, + "max_sentence1_length": 873, + "unique_sentence1": 6512, + "min_sentence2_length": 5, "average_sentence2_length": 150.9880303030303, - "num_samples": 6600, - "number_of_characters": 1956344 + "max_sentence2_length": 881, + "unique_sentence2": 6588 } } } diff --git a/mteb/descriptive_stats/BitextMining/PhincBitextMining.json b/mteb/descriptive_stats/BitextMining/PhincBitextMining.json new file mode 100644 index 000000000..f4b237d87 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/PhincBitextMining.json @@ -0,0 +1,30 @@ +{ + "train": { + "num_samples": 13738, + "number_of_characters": 2069457, + "unique_pairs": 13737, + "min_sentence1_length": 1, + "average_sentence1_length": 74.02300189256079, + "max_sentence1_length": 278, + "unique_sentence1": 13515, + "min_sentence2_length": 3, + "average_sentence2_length": 76.61442713640996, + "max_sentence2_length": 274, + "unique_sentence2": 13736, + "hf_subset_descriptive_stats": { + "eng-eng_hin": { + "num_samples": 13738, + "number_of_characters": 2069457, + "unique_pairs": 13737, + "min_sentence1_length": 1, + "average_sentence1_length": 74.02300189256079, + "max_sentence1_length": 278, + "unique_sentence1": 13515, + "min_sentence2_length": 3, + "average_sentence2_length": 76.61442713640996, + "max_sentence2_length": 274, + "unique_sentence2": 13736 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/TbilisiCityHallBitextMining.json b/mteb/descriptive_stats/BitextMining/TbilisiCityHallBitextMining.json new file mode 100644 index 000000000..12f400372 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/TbilisiCityHallBitextMining.json @@ -0,0 +1,43 @@ 
+{ + "test": { + "num_samples": 3640, + "number_of_characters": 572146, + "unique_pairs": 3640, + "min_sentence1_length": 13, + "average_sentence1_length": 78.59148351648352, + "max_sentence1_length": 203, + "unique_sentence1": 3636, + "min_sentence2_length": 13, + "average_sentence2_length": 78.59148351648352, + "max_sentence2_length": 203, + "unique_sentence2": 3636, + "hf_subset_descriptive_stats": { + "kat_Geor-eng_Latn": { + "num_samples": 1820, + "number_of_characters": 286073, + "unique_pairs": 1820, + "min_sentence1_length": 30, + "average_sentence1_length": 76.06593406593407, + "max_sentence1_length": 189, + "unique_sentence1": 1820, + "min_sentence2_length": 13, + "average_sentence2_length": 81.11703296703297, + "max_sentence2_length": 203, + "unique_sentence2": 1816 + }, + "eng_Latn-kat_Geor": { + "num_samples": 1820, + "number_of_characters": 286073, + "unique_pairs": 1820, + "min_sentence1_length": 13, + "average_sentence1_length": 81.11703296703297, + "max_sentence1_length": 203, + "unique_sentence1": 1816, + "min_sentence2_length": 30, + "average_sentence2_length": 76.06593406593407, + "max_sentence2_length": 189, + "unique_sentence2": 1820 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/VieMedEVBitextMining.json b/mteb/descriptive_stats/BitextMining/VieMedEVBitextMining.json new file mode 100644 index 000000000..2d97df573 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/VieMedEVBitextMining.json @@ -0,0 +1,15 @@ +{ + "test": { + "num_samples": 2048, + "number_of_characters": 575910, + "unique_pairs": 2048, + "min_sentence1_length": 11, + "average_sentence1_length": 139.22802734375, + "max_sentence1_length": 1291, + "unique_sentence1": 2048, + "min_sentence2_length": 11, + "average_sentence2_length": 141.97802734375, + "max_sentence2_length": 1217, + "unique_sentence2": 2047 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/LanguageClassification.json 
b/mteb/descriptive_stats/Classification/LanguageClassification.json index cf8b83d5d..6622d23be 100644 --- a/mteb/descriptive_stats/Classification/LanguageClassification.json +++ b/mteb/descriptive_stats/Classification/LanguageClassification.json @@ -2,7 +2,11 @@ "test": { "num_samples": 2048, "number_of_characters": 224352, + "num_texts_in_train": 31, + "min_text_length": 14, "average_text_length": 109.546875, + "max_text_length": 1270, + "unique_text": 2025, "unique_labels": 20, "labels": { "17": { @@ -66,5 +70,77 @@ "count": 103 } } + }, + "train": { + "num_samples": 70000, + "number_of_characters": 7760299, + "num_texts_in_train": null, + "min_text_length": 2, + "average_text_length": 110.86141428571429, + "max_text_length": 2422, + "unique_text": 68978, + "unique_labels": 20, + "labels": { + "12": { + "count": 3500 + }, + "1": { + "count": 3500 + }, + "19": { + "count": 3500 + }, + "15": { + "count": 3500 + }, + "13": { + "count": 3500 + }, + "11": { + "count": 3500 + }, + "17": { + "count": 3500 + }, + "14": { + "count": 3500 + }, + "16": { + "count": 3500 + }, + "5": { + "count": 3500 + }, + "0": { + "count": 3500 + }, + "8": { + "count": 3500 + }, + "7": { + "count": 3500 + }, + "2": { + "count": 3500 + }, + "3": { + "count": 3500 + }, + "10": { + "count": 3500 + }, + "6": { + "count": 3500 + }, + "18": { + "count": 3500 + }, + "4": { + "count": 3500 + }, + "9": { + "count": 3500 + } + } } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SlovakHateSpeechClassification.json b/mteb/descriptive_stats/Classification/SlovakHateSpeechClassification.json index 23225ae22..63fcfd3e5 100644 --- a/mteb/descriptive_stats/Classification/SlovakHateSpeechClassification.json +++ b/mteb/descriptive_stats/Classification/SlovakHateSpeechClassification.json @@ -2,7 +2,11 @@ "test": { "num_samples": 1319, "number_of_characters": 122279, + "num_texts_in_train": 46, + "min_text_length": 8, "average_text_length": 92.70583775587566, + 
"max_text_length": 1584, + "unique_text": 1315, "unique_labels": 2, "labels": { "1": { @@ -12,5 +16,23 @@ "count": 959 } } + }, + "train": { + "num_samples": 11870, + "number_of_characters": 1130860, + "num_texts_in_train": null, + "min_text_length": 7, + "average_text_length": 95.27042965459141, + "max_text_length": 2112, + "unique_text": 11655, + "unique_labels": 2, + "labels": { + "1": { + "count": 3245 + }, + "0": { + "count": 8625 + } + } } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/ArXivHierarchicalClusteringP2P.json b/mteb/descriptive_stats/Clustering/ArXivHierarchicalClusteringP2P.json index 8a5118e0c..e6066a83c 100644 --- a/mteb/descriptive_stats/Clustering/ArXivHierarchicalClusteringP2P.json +++ b/mteb/descriptive_stats/Clustering/ArXivHierarchicalClusteringP2P.json @@ -2,8 +2,12 @@ "test": { "num_samples": 2048, "number_of_characters": 2065284, + "min_text_length": 103, "average_text_length": 1008.439453125, + "max_text_length": 2103, + "min_labels_per_text": 1, "average_labels_per_text": 1.46337890625, + "max_labels_per_text": 381, "unique_labels": 129, "labels": { "cs": { diff --git a/mteb/descriptive_stats/Clustering/BiorxivClusteringS2S.json b/mteb/descriptive_stats/Clustering/BiorxivClusteringS2S.json index f1dda7920..2d9a0a01b 100644 --- a/mteb/descriptive_stats/Clustering/BiorxivClusteringS2S.json +++ b/mteb/descriptive_stats/Clustering/BiorxivClusteringS2S.json @@ -2,8 +2,13 @@ "test": { "num_samples": 10, "number_of_characters": 75000, + "min_text_length": 5000, "average_text_length": 7500.0, + "max_text_length": 10000, + "unique_texts": 41555, + "min_labels_per_text": 1, "average_labels_per_text": 7500.0, + "max_labels_per_text": 14251, "unique_labels": 26, "labels": { "neuroscience": { diff --git a/mteb/descriptive_stats/Clustering/MedrxivClusteringP2P.v2.json b/mteb/descriptive_stats/Clustering/MedrxivClusteringP2P.v2.json new file mode 100644 index 000000000..0370d5147 --- /dev/null +++ 
b/mteb/descriptive_stats/Clustering/MedrxivClusteringP2P.v2.json @@ -0,0 +1,168 @@ +{ + "test": { + "num_samples": 37500, + "number_of_characters": 74294927, + "min_text_length": 148, + "average_text_length": 1981.1980533333333, + "max_text_length": 38759, + "min_labels_per_text": 6, + "average_labels_per_text": 1.0, + "max_labels_per_text": 8830, + "unique_labels": 51, + "labels": { + "epidemiology": { + "count": 6656 + }, + "public and global health": { + "count": 3595 + }, + "oncology": { + "count": 845 + }, + "allergy and immunology": { + "count": 464 + }, + "orthopedics": { + "count": 104 + }, + "health informatics": { + "count": 1107 + }, + "occupational and environmental health": { + "count": 415 + }, + "infectious diseases": { + "count": 8830 + }, + "genetic and genomic medicine": { + "count": 1918 + }, + "health policy": { + "count": 527 + }, + "gastroenterology": { + "count": 343 + }, + "radiology and imaging": { + "count": 541 + }, + "pain medicine": { + "count": 121 + }, + "neurology": { + "count": 1773 + }, + "primary care research": { + "count": 232 + }, + "rheumatology": { + "count": 189 + }, + "endocrinology": { + "count": 419 + }, + "hematology": { + "count": 202 + }, + "addiction medicine": { + "count": 178 + }, + "pediatrics": { + "count": 589 + }, + "cardiovascular medicine": { + "count": 855 + }, + "obstetrics and gynecology": { + "count": 373 + }, + "health systems and quality improvement": { + "count": 491 + }, + "nephrology": { + "count": 241 + }, + "respiratory medicine": { + "count": 482 + }, + "geriatric medicine": { + "count": 169 + }, + "dentistry and oral medicine": { + "count": 159 + }, + "psychiatry and clinical psychology": { + "count": 1781 + }, + "nutrition": { + "count": 240 + }, + "intensive care and critical care medicine": { + "count": 368 + }, + "rehabilitation medicine and physical therapy": { + "count": 322 + }, + "otolaryngology": { + "count": 166 + }, + "nursing": { + "count": 93 + }, + "transplantation": { + "count": 118 
+ }, + "health economics": { + "count": 327 + }, + "sports medicine": { + "count": 180 + }, + "hiv aids": { + "count": 363 + }, + "dermatology": { + "count": 98 + }, + "pathology": { + "count": 223 + }, + "emergency medicine": { + "count": 191 + }, + "pharmacology and therapeutics": { + "count": 221 + }, + "ophthalmology": { + "count": 220 + }, + "medical ethics": { + "count": 46 + }, + "palliative medicine": { + "count": 45 + }, + "sexual and reproductive health": { + "count": 156 + }, + "medical education": { + "count": 203 + }, + "surgery": { + "count": 162 + }, + "urology": { + "count": 65 + }, + "anesthesia": { + "count": 72 + }, + "toxicology": { + "count": 16 + }, + "forensic medicine": { + "count": 6 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/MedrxivClusteringS2S.v2.json b/mteb/descriptive_stats/Clustering/MedrxivClusteringS2S.v2.json new file mode 100644 index 000000000..7b55ddd4d --- /dev/null +++ b/mteb/descriptive_stats/Clustering/MedrxivClusteringS2S.v2.json @@ -0,0 +1,168 @@ +{ + "test": { + "num_samples": 37500, + "number_of_characters": 4301276, + "min_text_length": 18, + "average_text_length": 114.70069333333333, + "max_text_length": 339, + "min_labels_per_text": 6, + "average_labels_per_text": 1.0, + "max_labels_per_text": 8830, + "unique_labels": 51, + "labels": { + "epidemiology": { + "count": 6656 + }, + "public and global health": { + "count": 3595 + }, + "oncology": { + "count": 845 + }, + "allergy and immunology": { + "count": 464 + }, + "orthopedics": { + "count": 104 + }, + "health informatics": { + "count": 1107 + }, + "occupational and environmental health": { + "count": 415 + }, + "infectious diseases": { + "count": 8830 + }, + "genetic and genomic medicine": { + "count": 1918 + }, + "health policy": { + "count": 527 + }, + "gastroenterology": { + "count": 343 + }, + "radiology and imaging": { + "count": 541 + }, + "pain medicine": { + "count": 121 + }, + "neurology": { + "count": 1773 + }, 
+ "primary care research": { + "count": 232 + }, + "rheumatology": { + "count": 189 + }, + "endocrinology": { + "count": 419 + }, + "hematology": { + "count": 202 + }, + "addiction medicine": { + "count": 178 + }, + "pediatrics": { + "count": 589 + }, + "cardiovascular medicine": { + "count": 855 + }, + "obstetrics and gynecology": { + "count": 373 + }, + "health systems and quality improvement": { + "count": 491 + }, + "nephrology": { + "count": 241 + }, + "respiratory medicine": { + "count": 482 + }, + "geriatric medicine": { + "count": 169 + }, + "dentistry and oral medicine": { + "count": 159 + }, + "psychiatry and clinical psychology": { + "count": 1781 + }, + "nutrition": { + "count": 240 + }, + "intensive care and critical care medicine": { + "count": 368 + }, + "rehabilitation medicine and physical therapy": { + "count": 322 + }, + "otolaryngology": { + "count": 166 + }, + "nursing": { + "count": 93 + }, + "transplantation": { + "count": 118 + }, + "health economics": { + "count": 327 + }, + "sports medicine": { + "count": 180 + }, + "hiv aids": { + "count": 363 + }, + "dermatology": { + "count": 98 + }, + "pathology": { + "count": 223 + }, + "emergency medicine": { + "count": 191 + }, + "pharmacology and therapeutics": { + "count": 221 + }, + "ophthalmology": { + "count": 220 + }, + "medical ethics": { + "count": 46 + }, + "palliative medicine": { + "count": 45 + }, + "sexual and reproductive health": { + "count": 156 + }, + "medical education": { + "count": 203 + }, + "surgery": { + "count": 162 + }, + "urology": { + "count": 65 + }, + "anesthesia": { + "count": 72 + }, + "toxicology": { + "count": 16 + }, + "forensic medicine": { + "count": 6 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/RedditClusteringP2P.v2.json b/mteb/descriptive_stats/Clustering/RedditClusteringP2P.v2.json new file mode 100644 index 000000000..ba997dbef --- /dev/null +++ b/mteb/descriptive_stats/Clustering/RedditClusteringP2P.v2.json @@ 
-0,0 +1,1335 @@ +{ + "test": { + "num_samples": 459389, + "number_of_characters": 334286895, + "min_text_length": 79, + "average_text_length": 727.6771864367671, + "max_text_length": 4359, + "min_labels_per_text": 2, + "average_labels_per_text": 1.0, + "max_labels_per_text": 77908, + "unique_labels": 440, + "labels": { + "FortNiteBR": { + "count": 436 + }, + "buildapc": { + "count": 8484 + }, + "offmychest": { + "count": 570 + }, + "nus": { + "count": 45 + }, + "relationship_advice": { + "count": 16651 + }, + "premed": { + "count": 201 + }, + "dogecoin": { + "count": 8108 + }, + "GamingLaptops": { + "count": 183 + }, + "asktransgender": { + "count": 326 + }, + "MachineLearning": { + "count": 61 + }, + "puppy101": { + "count": 1597 + }, + "GunAccessoriesForSale": { + "count": 2619 + }, + "Random_Acts_Of_Amazon": { + "count": 1115 + }, + "Catholicism": { + "count": 183 + }, + "MonsterHunter": { + "count": 218 + }, + "tipofmypenis": { + "count": 87 + }, + "samsung": { + "count": 69 + }, + "PersonalFinanceCanada": { + "count": 341 + }, + "Dyson_Sphere_Program": { + "count": 55 + }, + "bleach": { + "count": 41 + }, + "AmItheAsshole": { + "count": 3730 + }, + "WallStreetbetsELITE": { + "count": 328 + }, + "GlobalPowers": { + "count": 35 + }, + "ABraThatFits": { + "count": 159 + }, + "PokemonGoFriends": { + "count": 1165 + }, + "NoMansSkyTheGame": { + "count": 259 + }, + "masseffect": { + "count": 233 + }, + "dating_advice": { + "count": 559 + }, + "yoga": { + "count": 50 + }, + "depression": { + "count": 515 + }, + "COVID19positive": { + "count": 180 + }, + "generationology": { + "count": 37 + }, + "feedthebeast": { + "count": 192 + }, + "EliteDangerous": { + "count": 270 + }, + "alcoholicsanonymous": { + "count": 93 + }, + "GoRVing": { + "count": 35 + }, + "thedivision": { + "count": 111 + }, + "breakingmom": { + "count": 105 + }, + "AskAnAmerican": { + "count": 80 + }, + "HypnoFair": { + "count": 5 + }, + "JustUnsubbed": { + "count": 13 + }, + "socialanxiety": { + 
"count": 123 + }, + "dirtykikpals": { + "count": 202 + }, + "askTO": { + "count": 126 + }, + "AskCulinary": { + "count": 108 + }, + "Bogleheads": { + "count": 71 + }, + "dragonquest": { + "count": 45 + }, + "NoContract": { + "count": 30 + }, + "gorillaz": { + "count": 14 + }, + "MondoGore": { + "count": 8 + }, + "comicswap": { + "count": 56 + }, + "VirtualYoutubers": { + "count": 92 + }, + "Gta5Modding": { + "count": 28 + }, + "obs": { + "count": 61 + }, + "vcu": { + "count": 9 + }, + "KingkillerChronicle": { + "count": 17 + }, + "AmongUs": { + "count": 41 + }, + "wireshark": { + "count": 3 + }, + "Dodocodes": { + "count": 46 + }, + "Aliexpress": { + "count": 40 + }, + "LearnerDriverUK": { + "count": 12 + }, + "PanicAttack": { + "count": 23 + }, + "KassadinMains": { + "count": 10 + }, + "islam": { + "count": 93 + }, + "chronotrigger": { + "count": 4 + }, + "skincareexchange": { + "count": 13 + }, + "PokemonHome": { + "count": 21 + }, + "survivinginfidelity": { + "count": 71 + }, + "igcse": { + "count": 21 + }, + "C25K": { + "count": 21 + }, + "aorus": { + "count": 2 + }, + "idleon": { + "count": 19 + }, + "photography": { + "count": 22 + }, + "cryptocoins": { + "count": 7 + }, + "CanaryWharfBets": { + "count": 7 + }, + "KillingEve": { + "count": 7 + }, + "GameBuilderGarage": { + "count": 16 + }, + "SauceSharingCommunity": { + "count": 7 + }, + "turo": { + "count": 9 + }, + "foodscience": { + "count": 14 + }, + "HIMYM": { + "count": 20 + }, + "HauntingOfHillHouse": { + "count": 4 + }, + "GoodNotes": { + "count": 8 + }, + "RedditWritesSeinfeld": { + "count": 6 + }, + "AirReps": { + "count": 2 + }, + "ADHD": { + "count": 3811 + }, + "BuddyCrossing": { + "count": 446 + }, + "libraryofruina": { + "count": 98 + }, + "SluttyConfessions": { + "count": 2787 + }, + "tipofmytongue": { + "count": 7145 + }, + "fleshlight": { + "count": 128 + }, + "amcstock": { + "count": 13910 + }, + "teenagers": { + "count": 77908 + }, + "suggestmeabook": { + "count": 1540 + }, + 
"dirtypenpals": { + "count": 5587 + }, + "MinecraftServer": { + "count": 177 + }, + "CreditCards": { + "count": 669 + }, + "Guitar": { + "count": 10952 + }, + "rpg": { + "count": 529 + }, + "NoFap": { + "count": 14853 + }, + "lfg": { + "count": 1093 + }, + "MarsWallStreet": { + "count": 935 + }, + "SummonSign": { + "count": 931 + }, + "AssassinsCreedValhala": { + "count": 295 + }, + "hoi4": { + "count": 432 + }, + "Coins4Sale": { + "count": 260 + }, + "xbox": { + "count": 459 + }, + "TooAfraidToAsk": { + "count": 7404 + }, + "NBA2k": { + "count": 553 + }, + "KGBTR": { + "count": 943 + }, + "roblox": { + "count": 220 + }, + "salesforce": { + "count": 214 + }, + "TwoXChromosomes": { + "count": 1736 + }, + "mechmarket": { + "count": 4863 + }, + "Gaming_Headsets": { + "count": 103 + }, + "pittsburgh": { + "count": 189 + }, + "CryptoMars": { + "count": 1606 + }, + "FridayNightFunkin": { + "count": 378 + }, + "vaginismus": { + "count": 122 + }, + "transpositive": { + "count": 10 + }, + "comicbooks": { + "count": 274 + }, + "BDSMcommunity": { + "count": 185 + }, + "aliens": { + "count": 201 + }, + "Scotch": { + "count": 64 + }, + "KikRoleplay": { + "count": 141 + }, + "Kayaking": { + "count": 91 + }, + "196": { + "count": 47 + }, + "digimon": { + "count": 140 + }, + "Evernote": { + "count": 42 + }, + "logh": { + "count": 22 + }, + "arlington": { + "count": 15 + }, + "Adopted": { + "count": 8 + }, + "DissonautUniverse": { + "count": 4 + }, + "Midsommar": { + "count": 12 + }, + "SofiawithanF": { + "count": 83 + }, + "xmpp": { + "count": 6 + }, + "ZombsRoyale": { + "count": 16 + }, + "accesscontrol": { + "count": 8 + }, + "WetlanderHumor": { + "count": 2 + }, + "PoonamPandeyFanatics": { + "count": 2 + }, + "screenplaychallenge": { + "count": 2 + }, + "scatstories": { + "count": 2 + }, + "techsupport": { + "count": 290 + }, + "whatcarshouldIbuy": { + "count": 79 + }, + "Stormlight_Archive": { + "count": 15 + }, + "deadbydaylight": { + "count": 126 + }, + "bicycling": { + 
"count": 27 + }, + "oculus": { + "count": 64 + }, + "Cartalk": { + "count": 33 + }, + "Sims4": { + "count": 43 + }, + "NoFeeAC": { + "count": 95 + }, + "Crypto_com": { + "count": 37 + }, + "ITCareerQuestions": { + "count": 259 + }, + "aromantic": { + "count": 18 + }, + "Revu": { + "count": 3 + }, + "exalted": { + "count": 2 + }, + "HilariaBaldwin": { + "count": 20 + }, + "Testosterone": { + "count": 35 + }, + "Screenwriting": { + "count": 170 + }, + "LifeProTips": { + "count": 49 + }, + "steinsgate": { + "count": 13 + }, + "Baystreetbets": { + "count": 10 + }, + "AskGirls": { + "count": 7 + }, + "idlechampions": { + "count": 7 + }, + "facebook": { + "count": 17 + }, + "tf2trade": { + "count": 4 + }, + "mfdoom": { + "count": 3 + }, + "FiddlesticksMains": { + "count": 2 + }, + "HFY": { + "count": 10 + }, + "FiestaST": { + "count": 2 + }, + "whatsthatbook": { + "count": 994 + }, + "GearsOfWar": { + "count": 879 + }, + "KazuhaMains": { + "count": 175 + }, + "RepTime": { + "count": 211 + }, + "AstroGaming": { + "count": 141 + }, + "metalgearsolid": { + "count": 152 + }, + "qBittorrent": { + "count": 39 + }, + "ELLIPAL_Official": { + "count": 24 + }, + "raisedbynarcissists": { + "count": 4895 + }, + "unpopularopinion": { + "count": 14901 + }, + "ACTrade": { + "count": 5679 + }, + "askcarsales": { + "count": 1339 + }, + "AskVet": { + "count": 1357 + }, + "whowouldwin": { + "count": 4493 + }, + "playstation": { + "count": 1362 + }, + "anime": { + "count": 6531 + }, + "GME": { + "count": 12577 + }, + "DotA2": { + "count": 2004 + }, + "cryptostreetbets": { + "count": 2241 + }, + "MonsterHunterWorld": { + "count": 698 + }, + "Market76": { + "count": 14274 + }, + "DnD": { + "count": 5092 + }, + "leagueoflegends": { + "count": 3683 + }, + "doordash_drivers": { + "count": 1626 + }, + "theta_network": { + "count": 489 + }, + "exmuslim": { + "count": 1369 + }, + "gonewildaudio": { + "count": 2998 + }, + "conspiracy": { + "count": 3587 + }, + "heroesofthestorm": { + "count": 535 + 
}, + "FanFiction": { + "count": 2782 + }, + "Doom": { + "count": 1251 + }, + "texas": { + "count": 269 + }, + "Vent": { + "count": 1738 + }, + "selfimprovement": { + "count": 1284 + }, + "youtubers": { + "count": 706 + }, + "askseddit": { + "count": 237 + }, + "boardgames": { + "count": 1237 + }, + "bravelydefault": { + "count": 347 + }, + "ConquerorsBlade": { + "count": 238 + }, + "ChronicPain": { + "count": 527 + }, + "teenagersnew": { + "count": 256 + }, + "brasil": { + "count": 1092 + }, + "MatthiasSubmissions": { + "count": 921 + }, + "MarylandUnemployment": { + "count": 314 + }, + "SaltLakeCity": { + "count": 411 + }, + "BokunoheroFanfiction": { + "count": 155 + }, + "BenignExistence": { + "count": 125 + }, + "GayYoungOldDating": { + "count": 156 + }, + "Bible": { + "count": 202 + }, + "haskell": { + "count": 154 + }, + "seduction": { + "count": 400 + }, + "fantasywriters": { + "count": 262 + }, + "HiveOS": { + "count": 100 + }, + "PerkByDaylight": { + "count": 15 + }, + "Hedgehog": { + "count": 73 + }, + "xmen": { + "count": 263 + }, + "HyperRP": { + "count": 122 + }, + "emotestories": { + "count": 3 + }, + "tutanota": { + "count": 135 + }, + "CultoftheFranklin": { + "count": 46 + }, + "langrisser": { + "count": 62 + }, + "CozyGrove": { + "count": 61 + }, + "Sverigesforsvarsmakt": { + "count": 12 + }, + "silverbugbets": { + "count": 21 + }, + "WreckingBallMains": { + "count": 5 + }, + "capitalism_in_decay": { + "count": 8 + }, + "paintdotnet": { + "count": 11 + }, + "u_mawadom118": { + "count": 4 + }, + "xboxfindfriends": { + "count": 2 + }, + "CPTSD": { + "count": 540 + }, + "destiny2": { + "count": 318 + }, + "Wallstreetsilver": { + "count": 1013 + }, + "DestinyTheGame": { + "count": 1107 + }, + "blackopscoldwar": { + "count": 400 + }, + "InstacartShoppers": { + "count": 202 + }, + "RocketLeagueExchange": { + "count": 832 + }, + "apexlegends": { + "count": 3265 + }, + "kansascity": { + "count": 53 + }, + "namenerds": { + "count": 235 + }, + "help": { + 
"count": 152 + }, + "Kengan_Ashura": { + "count": 132 + }, + "thetagang": { + "count": 165 + }, + "GameSale": { + "count": 262 + }, + "Reduction": { + "count": 109 + }, + "sex": { + "count": 906 + }, + "bostonr4r": { + "count": 75 + }, + "LegendsOfRuneterra": { + "count": 231 + }, + "overlord": { + "count": 48 + }, + "madisonwi": { + "count": 53 + }, + "steelseries": { + "count": 79 + }, + "ClashOfClansRecruit": { + "count": 214 + }, + "CharacterRant": { + "count": 55 + }, + "AirForce": { + "count": 94 + }, + "sexstories": { + "count": 92 + }, + "NameThatSong": { + "count": 162 + }, + "depressed": { + "count": 74 + }, + "ibs": { + "count": 150 + }, + "40kLore": { + "count": 269 + }, + "podcasts": { + "count": 88 + }, + "miraculousladybug": { + "count": 150 + }, + "ask": { + "count": 224 + }, + "EverMerge": { + "count": 31 + }, + "TMJ": { + "count": 54 + }, + "BitLifeApp": { + "count": 39 + }, + "FireEmblemHeroes": { + "count": 100 + }, + "software": { + "count": 62 + }, + "ShieldAndroidTV": { + "count": 70 + }, + "GriefSupport": { + "count": 125 + }, + "onewheel": { + "count": 37 + }, + "MensRights": { + "count": 80 + }, + "nhl": { + "count": 22 + }, + "ClashOfClans": { + "count": 107 + }, + "ps3homebrew": { + "count": 33 + }, + "LightNovels": { + "count": 77 + }, + "redsox": { + "count": 34 + }, + "CryptoMarkets": { + "count": 44 + }, + "ugly": { + "count": 47 + }, + "GCXRep": { + "count": 12 + }, + "cscareerquestionsEU": { + "count": 65 + }, + "MindHunter": { + "count": 6 + }, + "starcraft2coop": { + "count": 15 + }, + "nanocurrency": { + "count": 1421 + }, + "ModelCars": { + "count": 8 + }, + "UKJobs": { + "count": 30 + }, + "Netherlands": { + "count": 44 + }, + "clonewars": { + "count": 8 + }, + "Julia": { + "count": 11 + }, + "Prolactinoma": { + "count": 9 + }, + "sofi": { + "count": 11 + }, + "royalfamily": { + "count": 6 + }, + "ConnecticutR4R": { + "count": 8 + }, + "weather": { + "count": 5 + }, + "oneui": { + "count": 7 + }, + "KTM": { + "count": 5 + }, + 
"Aerials": { + "count": 3 + }, + "seoul": { + "count": 2 + }, + "exjw": { + "count": 3281 + }, + "ModernMagic": { + "count": 699 + }, + "Paladins": { + "count": 1242 + }, + "kdramarecommends": { + "count": 1611 + }, + "hitbtc": { + "count": 330 + }, + "endocrinology": { + "count": 75 + }, + "Bath": { + "count": 43 + }, + "NassauCountyHookups": { + "count": 5 + }, + "feminineboys": { + "count": 1248 + }, + "dreamsmp": { + "count": 2018 + }, + "SquaredCircle": { + "count": 2255 + }, + "Minecraft": { + "count": 8753 + }, + "spirituality": { + "count": 1809 + }, + "Eldenring": { + "count": 1471 + }, + "Sat": { + "count": 1172 + }, + "bonnaroo": { + "count": 194 + }, + "gardening": { + "count": 1892 + }, + "Unemployment": { + "count": 6185 + }, + "mac": { + "count": 1847 + }, + "Bestbuy": { + "count": 437 + }, + "quittingkratom": { + "count": 1081 + }, + "lawschooladmissions": { + "count": 3436 + }, + "NiceHash": { + "count": 2135 + }, + "McMaster": { + "count": 815 + }, + "covidlonghaulers": { + "count": 1299 + }, + "stalker": { + "count": 758 + }, + "MLBTheShow": { + "count": 2721 + }, + "FortniteCompetitive": { + "count": 998 + }, + "dpdr": { + "count": 514 + }, + "appliancerepair": { + "count": 720 + }, + "thomasthetankengine": { + "count": 207 + }, + "delhi": { + "count": 217 + }, + "Huel": { + "count": 300 + }, + "leafs": { + "count": 203 + }, + "HotWheels": { + "count": 170 + }, + "90dayfianceuncensored": { + "count": 550 + }, + "Throwers": { + "count": 142 + }, + "Wavyhair": { + "count": 270 + }, + "CryptoHorde": { + "count": 128 + }, + "ShuumatsuNoValkyrie": { + "count": 453 + }, + "TeensMeetTeens": { + "count": 432 + }, + "dbrand": { + "count": 108 + }, + "SLFmeetups": { + "count": 18 + }, + "1200isplentyketo": { + "count": 48 + }, + "passive_income": { + "count": 211 + }, + "BroadCity": { + "count": 16 + }, + "RevenantMain": { + "count": 71 + }, + "extrarfl": { + "count": 25 + }, + "AgonGame": { + "count": 5 + }, + "FitnessDE": { + "count": 3 + }, + "gaming": 
{ + "count": 1277 + }, + "livesound": { + "count": 91 + }, + "IBO": { + "count": 1896 + }, + "EscapefromTarkov": { + "count": 1300 + }, + "amex": { + "count": 145 + }, + "DMAcademy": { + "count": 1411 + }, + "VinylCollectors": { + "count": 556 + }, + "cardano": { + "count": 716 + }, + "brave_browser": { + "count": 159 + }, + "dating": { + "count": 952 + }, + "OculusQuest": { + "count": 942 + }, + "Superstonk": { + "count": 3089 + }, + "MtF": { + "count": 957 + }, + "findaleague": { + "count": 207 + }, + "Nioh": { + "count": 398 + }, + "IRS": { + "count": 715 + }, + "transgendercirclejerk": { + "count": 353 + }, + "learnmath": { + "count": 489 + }, + "piano": { + "count": 263 + }, + "LeagueConnect": { + "count": 216 + }, + "eu4": { + "count": 561 + }, + "Wordpress": { + "count": 345 + }, + "RoleplayingForReddit": { + "count": 31 + }, + "LOONA": { + "count": 89 + }, + "newtothenavy": { + "count": 167 + }, + "HaircareScience": { + "count": 118 + }, + "appletv": { + "count": 167 + }, + "sissypersonals": { + "count": 102 + }, + "raleigh": { + "count": 168 + }, + "realonlyfansreviews": { + "count": 21 + }, + "AskGames": { + "count": 49 + }, + "PokemonTCG": { + "count": 325 + }, + "controlgame": { + "count": 109 + }, + "GoogleDataStudio": { + "count": 16 + }, + "WhiteWolfRPG": { + "count": 139 + }, + "MECoOp": { + "count": 31 + }, + "snuffrp": { + "count": 46 + }, + "lockpicking": { + "count": 103 + }, + "wicked_edge": { + "count": 105 + }, + "BMW": { + "count": 99 + }, + "choiceofgames": { + "count": 24 + }, + "hisdarkmaterials": { + "count": 12 + }, + "SakuraGakuin": { + "count": 24 + }, + "detrans": { + "count": 55 + }, + "Smallville": { + "count": 37 + }, + "kingofqueens": { + "count": 7 + }, + "JamesHoffmann": { + "count": 22 + }, + "stashinvest": { + "count": 16 + }, + "ABA": { + "count": 79 + }, + "ladybusiness": { + "count": 10 + }, + "gamegrumps": { + "count": 32 + }, + "GodEater": { + "count": 21 + }, + "tomorrow": { + "count": 39 + }, + "Tomorrowland": { + 
"count": 9 + }, + "BlackCountryNewRoad": { + "count": 5 + }, + "STAYC": { + "count": 3 + }, + "SatoshiStreetBets": { + "count": 3828 + }, + "AskLosAngeles": { + "count": 1036 + }, + "buildapcforme": { + "count": 1689 + }, + "ApplyingToCollege": { + "count": 10675 + }, + "watercooling": { + "count": 1209 + }, + "BreakUps": { + "count": 4914 + }, + "FIFA": { + "count": 3811 + }, + "emacs": { + "count": 712 + }, + "trakstocks": { + "count": 691 + }, + "Shittyaskflying": { + "count": 147 + }, + "AmazonFC": { + "count": 1178 + }, + "stocks": { + "count": 4610 + }, + "BangaloreMains": { + "count": 26 + }, + "pokemon": { + "count": 3953 + }, + "religion": { + "count": 684 + }, + "cuboulder": { + "count": 269 + }, + "self": { + "count": 1688 + }, + "tarot": { + "count": 912 + }, + "turtles": { + "count": 49 + }, + "TheMagnusArchives": { + "count": 300 + }, + "Superhero_Ideas": { + "count": 34 + }, + "NTU": { + "count": 308 + }, + "touhou": { + "count": 623 + }, + "JoJolion": { + "count": 50 + }, + "lasers": { + "count": 27 + }, + "popperpigs": { + "count": 67 + }, + "aggretsuko": { + "count": 20 + }, + "Library": { + "count": 5 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/RuSciBenchGRNTIClusteringP2P.json b/mteb/descriptive_stats/Clustering/RuSciBenchGRNTIClusteringP2P.json index 9eff1b40d..126cd893b 100644 --- a/mteb/descriptive_stats/Clustering/RuSciBenchGRNTIClusteringP2P.json +++ b/mteb/descriptive_stats/Clustering/RuSciBenchGRNTIClusteringP2P.json @@ -2,8 +2,12 @@ "test": { "num_samples": 2048, "number_of_characters": 1822339, + "min_text_length": 84, "average_text_length": 889.81396484375, + "max_text_length": 3143, + "min_labels_per_text": 73, "average_labels_per_text": 1.0, + "max_labels_per_text": 74, "unique_labels": 28, "labels": { "3": { diff --git a/mteb/descriptive_stats/Clustering/TwentyNewsgroupsClustering.v2.json b/mteb/descriptive_stats/Clustering/TwentyNewsgroupsClustering.v2.json new file mode 100644 index 
000000000..77be5a3b7 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/TwentyNewsgroupsClustering.v2.json @@ -0,0 +1,75 @@ +{ + "test": { + "num_samples": 59545, + "number_of_characters": 1907719, + "min_text_length": 11, + "average_text_length": 32.03827357460744, + "max_text_length": 120, + "min_labels_per_text": 2082, + "average_labels_per_text": 1.0, + "max_labels_per_text": 3236, + "unique_labels": 20, + "labels": { + "12": { + "count": 3137 + }, + "6": { + "count": 3070 + }, + "0": { + "count": 2613 + }, + "2": { + "count": 3155 + }, + "10": { + "count": 3220 + }, + "17": { + "count": 2986 + }, + "14": { + "count": 3106 + }, + "13": { + "count": 3055 + }, + "1": { + "count": 3056 + }, + "16": { + "count": 2911 + }, + "9": { + "count": 2984 + }, + "3": { + "count": 3070 + }, + "15": { + "count": 3090 + }, + "7": { + "count": 3036 + }, + "5": { + "count": 3124 + }, + "11": { + "count": 3236 + }, + "18": { + "count": 2483 + }, + "8": { + "count": 3090 + }, + "19": { + "count": 2082 + }, + "4": { + "count": 3041 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/WikiClusteringP2P.json b/mteb/descriptive_stats/Clustering/WikiClusteringP2P.json index 99b033bce..4c1f30309 100644 --- a/mteb/descriptive_stats/Clustering/WikiClusteringP2P.json +++ b/mteb/descriptive_stats/Clustering/WikiClusteringP2P.json @@ -2,8 +2,13 @@ "test": { "num_samples": 140, "number_of_characters": 71680, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 49704, + "min_labels_per_text": 1, "average_labels_per_text": 512.0, + "max_labels_per_text": 3986, "unique_labels": 282, "labels": { "Nauke": { @@ -857,8 +862,13 @@ "bs": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 3860, + "min_labels_per_text": 6, "average_labels_per_text": 512.0, + "max_labels_per_text": 1492, "unique_labels": 17, "labels": { 
"Nauke": { @@ -917,8 +927,13 @@ "ca": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 4596, + "min_labels_per_text": 20, "average_labels_per_text": 512.0, + "max_labels_per_text": 1844, "unique_labels": 8, "labels": { "Llocs": { @@ -950,8 +965,13 @@ "cs": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 4782, + "min_labels_per_text": 21, "average_labels_per_text": 512.0, + "max_labels_per_text": 1559, "unique_labels": 21, "labels": { "Lid\u00c3\u00a9": { @@ -1022,8 +1042,13 @@ "da": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 4725, + "min_labels_per_text": 35, "average_labels_per_text": 512.0, + "max_labels_per_text": 911, "unique_labels": 20, "labels": { "Natur": { @@ -1091,8 +1116,13 @@ "eu": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 4474, + "min_labels_per_text": 110, "average_labels_per_text": 512.0, + "max_labels_per_text": 2486, "unique_labels": 5, "labels": { "Entitateak": { @@ -1115,8 +1145,13 @@ "gv": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 2717, + "min_labels_per_text": 2, "average_labels_per_text": 512.0, + "max_labels_per_text": 1334, "unique_labels": 28, "labels": { "Chron-oaylleeaght": { @@ -1208,8 +1243,13 @@ "ilo": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 2258, + "min_labels_per_text": 1, "average_labels_per_text": 512.0, + "max_labels_per_text": 1405, "unique_labels": 34, "labels": { "Katutubo": { @@ -1319,8 +1359,13 
@@ "ku": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 3365, + "min_labels_per_text": 5, "average_labels_per_text": 512.0, + "max_labels_per_text": 1078, "unique_labels": 39, "labels": { "Kes": { @@ -1445,8 +1490,13 @@ "lv": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 4540, + "min_labels_per_text": 13, "average_labels_per_text": 512.0, + "max_labels_per_text": 878, "unique_labels": 16, "labels": { "Kult\u00c5\u00abra": { @@ -1502,8 +1552,13 @@ "min": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 3881, + "min_labels_per_text": 1, "average_labels_per_text": 512.0, + "max_labels_per_text": 3986, "unique_labels": 16, "labels": { "Makaluak_iduik": { @@ -1559,8 +1614,13 @@ "mt": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 1887, + "min_labels_per_text": 2, "average_labels_per_text": 512.0, + "max_labels_per_text": 1634, "unique_labels": 27, "labels": { "\u00c4\u00a0eografija": { @@ -1649,8 +1709,13 @@ "sco": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 2605, + "min_labels_per_text": 3, "average_labels_per_text": 512.0, + "max_labels_per_text": 1081, "unique_labels": 23, "labels": { "Life": { @@ -1727,8 +1792,13 @@ "sq": { "num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 3741, + "min_labels_per_text": 2, "average_labels_per_text": 512.0, + "max_labels_per_text": 1109, "unique_labels": 36, "labels": { "Gjeografi": { @@ -1844,8 +1914,13 @@ "wa": { 
"num_samples": 10, "number_of_characters": 5120, + "min_text_length": 512, "average_text_length": 512.0, + "max_text_length": 512, + "unique_texts": 2317, + "min_labels_per_text": 2, "average_labels_per_text": 512.0, + "max_labels_per_text": 3653, "unique_labels": 6, "labels": { "Economeye": { diff --git a/mteb/descriptive_stats/InstructionRetrieval/Core17InstructionRetrieval.json b/mteb/descriptive_stats/InstructionRetrieval/Core17InstructionRetrieval.json index 8a912bee4..897b23d7c 100644 --- a/mteb/descriptive_stats/InstructionRetrieval/Core17InstructionRetrieval.json +++ b/mteb/descriptive_stats/InstructionRetrieval/Core17InstructionRetrieval.json @@ -4,11 +4,27 @@ "num_docs": 19899, "num_queries": 20, "number_of_characters": 44450333, + "min_document_length": 7, "average_document_length": 2233.0329664807277, + "max_document_length": 2959, + "unique_docs": 19143, + "min_query_length": 55, "average_query_length": 109.75, + "max_query_length": 278, + "unique_queries": 20, + "min_instruction_length": 102, "average_instruction_length": 295.55, + "max_instruction_length": 811, + "unique_instructions": 20, + "min_changed_instruction_length": 151, "average_changed_instruction_length": 355.2, + "max_changed_instruction_length": 837, + "unique_changed_instructions": 20, + "min_average_relevant_docs_per_query": 4, "average_relevant_docs_per_query": 32.7, - "average_top_ranked_per_query": 1000.0 + "max_average_relevant_docs_per_query": 55, + "min_average_top_ranked_per_query": 1000, + "average_top_ranked_per_query": 1000.0, + "max_average_top_ranked_per_query": 1000 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/MultilabelClassification/CEDRClassification.json b/mteb/descriptive_stats/MultilabelClassification/CEDRClassification.json index 2120a1113..d5d91adf5 100644 --- a/mteb/descriptive_stats/MultilabelClassification/CEDRClassification.json +++ b/mteb/descriptive_stats/MultilabelClassification/CEDRClassification.json @@ -1,9 +1,15 @@ { "test": { - 
"average_text_length": 91.20563230605738, + "num_samples": 1882, "number_of_characters": 171649, + "number_texts_in_train": 7, + "min_text_length": 6, + "average_text_length": 91.20563230605738, + "max_text_length": 220, + "unique_texts": 1875, + "min_labels_per_text": 0, "average_label_per_text": 0.620616365568544, - "num_samples": 1882, + "max_labels_per_text": 2, "unique_labels": 6, "labels": { "None": { @@ -25,5 +31,38 @@ "count": 125 } } + }, + "train": { + "num_samples": 7528, + "number_of_characters": 697322, + "number_texts_in_train": null, + "min_text_length": 5, + "average_text_length": 92.63044633368757, + "max_text_length": 280, + "unique_texts": 7500, + "min_labels_per_text": 0, + "average_label_per_text": 0.6101222104144527, + "max_labels_per_text": 3, + "unique_labels": 6, + "labels": { + "None": { + "count": 3043 + }, + "2": { + "count": 607 + }, + "0": { + "count": 1569 + }, + "3": { + "count": 589 + }, + "1": { + "count": 1417 + }, + "4": { + "count": 411 + } + } } } \ No newline at end of file diff --git a/mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json b/mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json deleted file mode 100644 index 2f4f979d0..000000000 --- a/mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json +++ /dev/null @@ -1,1732 +0,0 @@ -{ - "test": { - "average_text_length": 12014.408930434782, - "number_of_characters": 1381657027, - "average_label_per_text": 3.5938, - "num_samples": 115000, - "unique_labels": 21, - "labels": { - "18": { - "count": 50784 - }, - "15": { - "count": 30981 - }, - "5": { - "count": 24978 - }, - "6": { - "count": 45080 - }, - "3": { - "count": 63687 - }, - "17": { - "count": 37743 - }, - "1": { - "count": 15019 - }, - "20": { - "count": 14030 - }, - "0": { - "count": 17802 - }, - "2": { - "count": 22402 - }, - "19": { - "count": 10212 - }, - "9": { - "count": 3772 - }, - "4": { - "count": 9062 - 
}, - "10": { - "count": 7705 - }, - "11": { - "count": 12213 - }, - "7": { - "count": 14306 - }, - "12": { - "count": 11799 - }, - "8": { - "count": 13800 - }, - "13": { - "count": 2346 - }, - "14": { - "count": 4255 - }, - "16": { - "count": 1311 - } - }, - "hf_subset_descriptive_stats": { - "en": { - "average_text_length": 11720.2926, - "number_of_characters": 58601463, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "de": { - "average_text_length": 12865.4162, - "number_of_characters": 64327081, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "fr": { - "average_text_length": 13081.1098, - "number_of_characters": 65405549, 
- "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "it": { - "average_text_length": 12763.4786, - "number_of_characters": 63817393, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "es": { - "average_text_length": 13080.29, - "number_of_characters": 65401450, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - 
"count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "pl": { - "average_text_length": 12282.5926, - "number_of_characters": 61412963, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "ro": { - "average_text_length": 12836.9322, - "number_of_characters": 64184661, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "nl": { - 
"average_text_length": 12857.9742, - "number_of_characters": 64289871, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "el": { - "average_text_length": 12998.143, - "number_of_characters": 64990715, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "hu": { - "average_text_length": 12424.641, - "number_of_characters": 62123205, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - 
}, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "pt": { - "average_text_length": 12482.4616, - "number_of_characters": 62412308, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "cs": { - "average_text_length": 10783.4676, - "number_of_characters": 53917338, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - 
"count": 185 - }, - "16": { - "count": 57 - } - } - }, - "sv": { - "average_text_length": 11612.4774, - "number_of_characters": 58062387, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "bg": { - "average_text_length": 12235.4268, - "number_of_characters": 61177134, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "da": { - "average_text_length": 11773.958, - "number_of_characters": 58869790, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 
2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "fi": { - "average_text_length": 12087.6862, - "number_of_characters": 60438431, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "sk": { - "average_text_length": 11130.814, - "number_of_characters": 55654070, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - 
"count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "lt": { - "average_text_length": 11245.3566, - "number_of_characters": 56226783, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "hr": { - "average_text_length": 11022.142, - "number_of_characters": 55110710, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "sl": { - "average_text_length": 10620.0594, - "number_of_characters": 53100297, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 
1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "et": { - "average_text_length": 10898.4312, - "number_of_characters": 54492156, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "lv": { - "average_text_length": 10938.5102, - "number_of_characters": 54692551, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { 
- "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - }, - "mt": { - "average_text_length": 12589.7442, - "number_of_characters": 62948721, - "average_label_per_text": 3.5938, - "num_samples": 5000, - "unique_labels": 21, - "labels": { - "18": { - "count": 2208 - }, - "15": { - "count": 1347 - }, - "5": { - "count": 1086 - }, - "6": { - "count": 1960 - }, - "3": { - "count": 2769 - }, - "17": { - "count": 1641 - }, - "1": { - "count": 653 - }, - "20": { - "count": 610 - }, - "0": { - "count": 774 - }, - "2": { - "count": 974 - }, - "19": { - "count": 444 - }, - "9": { - "count": 164 - }, - "4": { - "count": 394 - }, - "10": { - "count": 335 - }, - "11": { - "count": 531 - }, - "7": { - "count": 622 - }, - "12": { - "count": 513 - }, - "8": { - "count": 600 - }, - "13": { - "count": 102 - }, - "14": { - "count": 185 - }, - "16": { - "count": 57 - } - } - } - } - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/PawsXPairClassification.json b/mteb/descriptive_stats/PairClassification/PawsXPairClassification.json index 63180983c..849724bdb 100644 --- a/mteb/descriptive_stats/PairClassification/PawsXPairClassification.json +++ b/mteb/descriptive_stats/PairClassification/PawsXPairClassification.json @@ -2,8 +2,14 @@ "test": { "num_samples": 14000, "number_of_characters": 2551922, - "avg_sentence1_len": 91.17892857142857, - "avg_sentence2_len": 91.10121428571429, + "min_sentence1_length": 2, + "avg_sentence1_length": 91.17892857142857, + "max_sentence1_length": 268, + "unique_sentence1": 13404, + "min_sentence2_length": 2, + "avg_sentence2_length": 91.10121428571429, + "max_sentence2_length": 247, + "unique_sentence2": 13462, "unique_labels": 2, "labels": { "1": { @@ -17,8 +23,14 @@ "de": { "num_samples": 2000, "number_of_characters": 478034, - "avg_sentence1_len": 119.7815, - "avg_sentence2_len": 119.2355, + 
"min_sentence1_length": 2, + "avg_sentence1_length": 119.7815, + "max_sentence1_length": 268, + "unique_sentence1": 1934, + "min_sentence2_length": 2, + "avg_sentence2_length": 119.2355, + "max_sentence2_length": 235, + "unique_sentence2": 1938, "unique_labels": 2, "labels": { "1": { @@ -32,8 +44,14 @@ "en": { "num_samples": 2000, "number_of_characters": 454362, - "avg_sentence1_len": 113.7575, - "avg_sentence2_len": 113.4235, + "min_sentence1_length": 25, + "avg_sentence1_length": 113.7575, + "max_sentence1_length": 209, + "unique_sentence1": 1761, + "min_sentence2_length": 25, + "avg_sentence2_length": 113.4235, + "max_sentence2_length": 209, + "unique_sentence2": 1800, "unique_labels": 2, "labels": { "1": { @@ -47,8 +65,14 @@ "es": { "num_samples": 2000, "number_of_characters": 471226, - "avg_sentence1_len": 117.815, - "avg_sentence2_len": 117.798, + "min_sentence1_length": 2, + "avg_sentence1_length": 117.815, + "max_sentence1_length": 226, + "unique_sentence1": 1955, + "min_sentence2_length": 22, + "avg_sentence2_length": 117.798, + "max_sentence2_length": 233, + "unique_sentence2": 1959, "unique_labels": 2, "labels": { "1": { @@ -62,8 +86,14 @@ "fr": { "num_samples": 2000, "number_of_characters": 480033, - "avg_sentence1_len": 120.028, - "avg_sentence2_len": 119.9885, + "min_sentence1_length": 2, + "avg_sentence1_length": 120.028, + "max_sentence1_length": 238, + "unique_sentence1": 1954, + "min_sentence2_length": 2, + "avg_sentence2_length": 119.9885, + "max_sentence2_length": 247, + "unique_sentence2": 1953, "unique_labels": 2, "labels": { "1": { @@ -77,8 +107,14 @@ "ja": { "num_samples": 2000, "number_of_characters": 235106, - "avg_sentence1_len": 58.678, - "avg_sentence2_len": 58.875, + "min_sentence1_length": 2, + "avg_sentence1_length": 58.678, + "max_sentence1_length": 192, + "unique_sentence1": 1944, + "min_sentence2_length": 2, + "avg_sentence2_length": 58.875, + "max_sentence2_length": 198, + "unique_sentence2": 1941, "unique_labels": 2, "labels": { 
"1": { @@ -92,8 +128,14 @@ "ko": { "num_samples": 2000, "number_of_characters": 260149, - "avg_sentence1_len": 64.9605, - "avg_sentence2_len": 65.114, + "min_sentence1_length": 2, + "avg_sentence1_length": 64.9605, + "max_sentence1_length": 153, + "unique_sentence1": 1954, + "min_sentence2_length": 2, + "avg_sentence2_length": 65.114, + "max_sentence2_length": 159, + "unique_sentence2": 1969, "unique_labels": 2, "labels": { "1": { @@ -107,8 +149,14 @@ "zh": { "num_samples": 2000, "number_of_characters": 173012, - "avg_sentence1_len": 43.232, - "avg_sentence2_len": 43.274, + "min_sentence1_length": 2, + "avg_sentence1_length": 43.232, + "max_sentence1_length": 120, + "unique_sentence1": 1909, + "min_sentence2_length": 2, + "avg_sentence2_length": 43.274, + "max_sentence2_length": 113, + "unique_sentence2": 1909, "unique_labels": 2, "labels": { "1": { @@ -124,8 +172,14 @@ "validation": { "num_samples": 14000, "number_of_characters": 2524625, - "avg_sentence1_len": 90.12585714285714, - "avg_sentence2_len": 90.2045, + "min_sentence1_length": 2, + "avg_sentence1_length": 90.12585714285714, + "max_sentence1_length": 248, + "unique_sentence1": 13357, + "min_sentence2_length": 2, + "avg_sentence2_length": 90.2045, + "max_sentence2_length": 275, + "unique_sentence2": 13397, "unique_labels": 2, "labels": { "1": { @@ -139,8 +193,14 @@ "de": { "num_samples": 2000, "number_of_characters": 467643, - "avg_sentence1_len": 116.82, - "avg_sentence2_len": 117.0015, + "min_sentence1_length": 2, + "avg_sentence1_length": 116.82, + "max_sentence1_length": 248, + "unique_sentence1": 1914, + "min_sentence2_length": 2, + "avg_sentence2_length": 117.0015, + "max_sentence2_length": 275, + "unique_sentence2": 1920, "unique_labels": 2, "labels": { "1": { @@ -154,8 +214,14 @@ "en": { "num_samples": 2000, "number_of_characters": 451931, - "avg_sentence1_len": 113.1075, - "avg_sentence2_len": 112.858, + "min_sentence1_length": 25, + "avg_sentence1_length": 113.1075, + "max_sentence1_length": 213, 
+ "unique_sentence1": 1758, + "min_sentence2_length": 25, + "avg_sentence2_length": 112.858, + "max_sentence2_length": 213, + "unique_sentence2": 1771, "unique_labels": 2, "labels": { "1": { @@ -169,8 +235,14 @@ "es": { "num_samples": 2000, "number_of_characters": 466112, - "avg_sentence1_len": 116.3285, - "avg_sentence2_len": 116.7275, + "min_sentence1_length": 2, + "avg_sentence1_length": 116.3285, + "max_sentence1_length": 240, + "unique_sentence1": 1938, + "min_sentence2_length": 2, + "avg_sentence2_length": 116.7275, + "max_sentence2_length": 241, + "unique_sentence2": 1941, "unique_labels": 2, "labels": { "1": { @@ -184,8 +256,14 @@ "fr": { "num_samples": 2000, "number_of_characters": 478510, - "avg_sentence1_len": 119.5045, - "avg_sentence2_len": 119.7505, + "min_sentence1_length": 2, + "avg_sentence1_length": 119.5045, + "max_sentence1_length": 233, + "unique_sentence1": 1933, + "min_sentence2_length": 2, + "avg_sentence2_length": 119.7505, + "max_sentence2_length": 246, + "unique_sentence2": 1939, "unique_labels": 2, "labels": { "1": { @@ -199,8 +277,14 @@ "ja": { "num_samples": 2000, "number_of_characters": 229655, - "avg_sentence1_len": 57.5105, - "avg_sentence2_len": 57.317, + "min_sentence1_length": 2, + "avg_sentence1_length": 57.5105, + "max_sentence1_length": 126, + "unique_sentence1": 1957, + "min_sentence2_length": 2, + "avg_sentence2_length": 57.317, + "max_sentence2_length": 121, + "unique_sentence2": 1969, "unique_labels": 2, "labels": { "1": { @@ -214,8 +298,14 @@ "ko": { "num_samples": 2000, "number_of_characters": 261355, - "avg_sentence1_len": 65.162, - "avg_sentence2_len": 65.5155, + "min_sentence1_length": 2, + "avg_sentence1_length": 65.162, + "max_sentence1_length": 178, + "unique_sentence1": 1963, + "min_sentence2_length": 2, + "avg_sentence2_length": 65.5155, + "max_sentence2_length": 174, + "unique_sentence2": 1968, "unique_labels": 2, "labels": { "1": { @@ -229,8 +319,14 @@ "zh": { "num_samples": 2000, "number_of_characters": 
169419, - "avg_sentence1_len": 42.448, - "avg_sentence2_len": 42.2615, + "min_sentence1_length": 2, + "avg_sentence1_length": 42.448, + "max_sentence1_length": 101, + "unique_sentence1": 1899, + "min_sentence2_length": 2, + "avg_sentence2_length": 42.2615, + "max_sentence2_length": 120, + "unique_sentence2": 1895, "unique_labels": 2, "labels": { "1": { diff --git a/mteb/descriptive_stats/PairClassification/TwitterURLCorpus.json b/mteb/descriptive_stats/PairClassification/TwitterURLCorpus.json index 6ca4a5616..473a765dd 100644 --- a/mteb/descriptive_stats/PairClassification/TwitterURLCorpus.json +++ b/mteb/descriptive_stats/PairClassification/TwitterURLCorpus.json @@ -2,8 +2,14 @@ "test": { "num_samples": 51534, "number_of_characters": 8659940, - "avg_sentence1_len": 79.48919160166103, - "avg_sentence2_len": 88.5540419916948, + "min_sentence1_length": 24, + "avg_sentence1_length": 79.48919160166103, + "max_sentence1_length": 126, + "unique_sentence1": 4329, + "min_sentence2_length": 6, + "avg_sentence2_length": 88.5540419916948, + "max_sentence2_length": 608, + "unique_sentence2": 41304, "unique_labels": 2, "labels": { "0": { diff --git a/mteb/descriptive_stats/PairClassification/XNLI.json b/mteb/descriptive_stats/PairClassification/XNLI.json index 91ef22435..867fafdc8 100644 --- a/mteb/descriptive_stats/PairClassification/XNLI.json +++ b/mteb/descriptive_stats/PairClassification/XNLI.json @@ -2,8 +2,14 @@ "test": { "num_samples": 19110, "number_of_characters": 2907145, - "avg_sentence1_len": 103.23793825222397, - "avg_sentence2_len": 48.88895866038723, + "min_sentence1_length": 3, + "avg_sentence1_length": 103.23793825222397, + "max_sentence1_length": 401, + "unique_sentence1": 15328, + "min_sentence2_length": 2, + "avg_sentence2_length": 48.88895866038723, + "max_sentence2_length": 187, + "unique_sentence2": 19104, "unique_labels": 2, "labels": { "0": { @@ -17,8 +23,14 @@ "ar": { "num_samples": 1365, "number_of_characters": 179591, - "avg_sentence1_len": 
89.57362637362637, - "avg_sentence2_len": 41.99487179487179, + "min_sentence1_length": 11, + "avg_sentence1_length": 89.57362637362637, + "max_sentence1_length": 242, + "unique_sentence1": 1095, + "min_sentence2_length": 8, + "avg_sentence2_length": 41.99487179487179, + "max_sentence2_length": 115, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -32,8 +44,14 @@ "bg": { "num_samples": 1365, "number_of_characters": 220646, - "avg_sentence1_len": 110.01611721611722, - "avg_sentence2_len": 51.62930402930403, + "min_sentence1_length": 14, + "avg_sentence1_length": 110.01611721611722, + "max_sentence1_length": 303, + "unique_sentence1": 1095, + "min_sentence2_length": 8, + "avg_sentence2_length": 51.62930402930403, + "max_sentence2_length": 150, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -47,8 +65,14 @@ "de": { "num_samples": 1365, "number_of_characters": 241224, - "avg_sentence1_len": 119.92600732600732, - "avg_sentence2_len": 56.794871794871796, + "min_sentence1_length": 3, + "avg_sentence1_length": 119.92600732600732, + "max_sentence1_length": 301, + "unique_sentence1": 1095, + "min_sentence2_length": 9, + "avg_sentence2_length": 56.794871794871796, + "max_sentence2_length": 187, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -62,8 +86,14 @@ "el": { "num_samples": 1365, "number_of_characters": 240222, - "avg_sentence1_len": 119.05421245421246, - "avg_sentence2_len": 56.93260073260073, + "min_sentence1_length": 13, + "avg_sentence1_length": 119.05421245421246, + "max_sentence1_length": 344, + "unique_sentence1": 1095, + "min_sentence2_length": 13, + "avg_sentence2_length": 56.93260073260073, + "max_sentence2_length": 172, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -77,8 +107,14 @@ "en": { "num_samples": 1365, "number_of_characters": 212223, - "avg_sentence1_len": 105.67032967032966, - "avg_sentence2_len": 49.8043956043956, + "min_sentence1_length": 19, + 
"avg_sentence1_length": 105.67032967032966, + "max_sentence1_length": 268, + "unique_sentence1": 1095, + "min_sentence2_length": 9, + "avg_sentence2_length": 49.8043956043956, + "max_sentence2_length": 137, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -92,8 +128,14 @@ "es": { "num_samples": 1365, "number_of_characters": 232207, - "avg_sentence1_len": 115.43296703296703, - "avg_sentence2_len": 54.68205128205128, + "min_sentence1_length": 11, + "avg_sentence1_length": 115.43296703296703, + "max_sentence1_length": 385, + "unique_sentence1": 1094, + "min_sentence2_length": 8, + "avg_sentence2_length": 54.68205128205128, + "max_sentence2_length": 163, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -107,8 +149,14 @@ "fr": { "num_samples": 1365, "number_of_characters": 245259, - "avg_sentence1_len": 121.0967032967033, - "avg_sentence2_len": 58.58021978021978, + "min_sentence1_length": 9, + "avg_sentence1_length": 121.0967032967033, + "max_sentence1_length": 327, + "unique_sentence1": 1095, + "min_sentence2_length": 10, + "avg_sentence2_length": 58.58021978021978, + "max_sentence2_length": 169, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -122,8 +170,14 @@ "hi": { "num_samples": 1365, "number_of_characters": 211312, - "avg_sentence1_len": 104.63443223443224, - "avg_sentence2_len": 50.17289377289377, + "min_sentence1_length": 16, + "avg_sentence1_length": 104.63443223443224, + "max_sentence1_length": 401, + "unique_sentence1": 1095, + "min_sentence2_length": 9, + "avg_sentence2_length": 50.17289377289377, + "max_sentence2_length": 162, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -137,8 +191,14 @@ "ru": { "num_samples": 1365, "number_of_characters": 222797, - "avg_sentence1_len": 110.76923076923077, - "avg_sentence2_len": 52.452014652014654, + "min_sentence1_length": 11, + "avg_sentence1_length": 110.76923076923077, + "max_sentence1_length": 306, + "unique_sentence1": 1095, + 
"min_sentence2_length": 8, + "avg_sentence2_length": 52.452014652014654, + "max_sentence2_length": 167, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -152,8 +212,14 @@ "sw": { "num_samples": 1365, "number_of_characters": 210103, - "avg_sentence1_len": 104.43956043956044, - "avg_sentence2_len": 49.48205128205128, + "min_sentence1_length": 10, + "avg_sentence1_length": 104.43956043956044, + "max_sentence1_length": 266, + "unique_sentence1": 1094, + "min_sentence2_length": 2, + "avg_sentence2_length": 49.48205128205128, + "max_sentence2_length": 146, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -167,8 +233,14 @@ "th": { "num_samples": 1365, "number_of_characters": 192788, - "avg_sentence1_len": 96.6923076923077, - "avg_sentence2_len": 44.544322344322346, + "min_sentence1_length": 12, + "avg_sentence1_length": 96.6923076923077, + "max_sentence1_length": 262, + "unique_sentence1": 1095, + "min_sentence2_length": 6, + "avg_sentence2_length": 44.544322344322346, + "max_sentence2_length": 129, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -182,8 +254,14 @@ "tr": { "num_samples": 1365, "number_of_characters": 208658, - "avg_sentence1_len": 103.67765567765568, - "avg_sentence2_len": 49.18534798534799, + "min_sentence1_length": 15, + "avg_sentence1_length": 103.67765567765568, + "max_sentence1_length": 255, + "unique_sentence1": 1095, + "min_sentence2_length": 6, + "avg_sentence2_length": 49.18534798534799, + "max_sentence2_length": 140, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -197,8 +275,14 @@ "vi": { "num_samples": 1365, "number_of_characters": 223549, - "avg_sentence1_len": 111.31208791208792, - "avg_sentence2_len": 52.46007326007326, + "min_sentence1_length": 14, + "avg_sentence1_length": 111.31208791208792, + "max_sentence1_length": 265, + "unique_sentence1": 1095, + "min_sentence2_length": 9, + "avg_sentence2_length": 52.46007326007326, + "max_sentence2_length": 143, + 
"unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -212,8 +296,14 @@ "zh": { "num_samples": 1365, "number_of_characters": 66566, - "avg_sentence1_len": 33.03589743589744, - "avg_sentence2_len": 15.73040293040293, + "min_sentence1_length": 4, + "avg_sentence1_length": 33.03589743589744, + "max_sentence1_length": 112, + "unique_sentence1": 1095, + "min_sentence2_length": 3, + "avg_sentence2_length": 15.73040293040293, + "max_sentence2_length": 59, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -229,8 +319,14 @@ "validation": { "num_samples": 19110, "number_of_characters": 2909058, - "avg_sentence1_len": 103.20790162218734, - "avg_sentence2_len": 49.01909994767138, + "min_sentence1_length": 5, + "avg_sentence1_length": 103.20790162218734, + "max_sentence1_length": 323, + "unique_sentence1": 11171, + "min_sentence2_length": 3, + "avg_sentence2_length": 49.01909994767138, + "max_sentence2_length": 172, + "unique_sentence2": 19101, "unique_labels": 2, "labels": { "0": { @@ -244,8 +340,14 @@ "ar": { "num_samples": 1365, "number_of_characters": 177355, - "avg_sentence1_len": 88.31868131868131, - "avg_sentence2_len": 41.61172161172161, + "min_sentence1_length": 13, + "avg_sentence1_length": 88.31868131868131, + "max_sentence1_length": 214, + "unique_sentence1": 798, + "min_sentence2_length": 6, + "avg_sentence2_length": 41.61172161172161, + "max_sentence2_length": 137, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -259,8 +361,14 @@ "bg": { "num_samples": 1365, "number_of_characters": 219988, - "avg_sentence1_len": 109.196336996337, - "avg_sentence2_len": 51.967032967032964, + "min_sentence1_length": 16, + "avg_sentence1_length": 109.196336996337, + "max_sentence1_length": 316, + "unique_sentence1": 798, + "min_sentence2_length": 10, + "avg_sentence2_length": 51.967032967032964, + "max_sentence2_length": 151, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -274,8 +382,14 @@ "de": { 
"num_samples": 1365, "number_of_characters": 241852, - "avg_sentence1_len": 119.81172161172161, - "avg_sentence2_len": 57.36923076923077, + "min_sentence1_length": 20, + "avg_sentence1_length": 119.81172161172161, + "max_sentence1_length": 298, + "unique_sentence1": 798, + "min_sentence2_length": 12, + "avg_sentence2_length": 57.36923076923077, + "max_sentence2_length": 162, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -289,8 +403,14 @@ "el": { "num_samples": 1365, "number_of_characters": 241275, - "avg_sentence1_len": 119.87545787545787, - "avg_sentence2_len": 56.88278388278388, + "min_sentence1_length": 16, + "avg_sentence1_length": 119.87545787545787, + "max_sentence1_length": 302, + "unique_sentence1": 798, + "min_sentence2_length": 6, + "avg_sentence2_length": 56.88278388278388, + "max_sentence2_length": 171, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -304,8 +424,14 @@ "en": { "num_samples": 1365, "number_of_characters": 212384, - "avg_sentence1_len": 105.71648351648352, - "avg_sentence2_len": 49.87619047619047, + "min_sentence1_length": 20, + "avg_sentence1_length": 105.71648351648352, + "max_sentence1_length": 271, + "unique_sentence1": 798, + "min_sentence2_length": 8, + "avg_sentence2_length": 49.87619047619047, + "max_sentence2_length": 139, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -319,8 +445,14 @@ "es": { "num_samples": 1365, "number_of_characters": 232451, - "avg_sentence1_len": 115.17289377289377, - "avg_sentence2_len": 55.120879120879124, + "min_sentence1_length": 14, + "avg_sentence1_length": 115.17289377289377, + "max_sentence1_length": 265, + "unique_sentence1": 798, + "min_sentence2_length": 7, + "avg_sentence2_length": 55.120879120879124, + "max_sentence2_length": 148, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -334,8 +466,14 @@ "fr": { "num_samples": 1365, "number_of_characters": 246857, - "avg_sentence1_len": 121.75897435897436, - 
"avg_sentence2_len": 59.08864468864469, + "min_sentence1_length": 19, + "avg_sentence1_length": 121.75897435897436, + "max_sentence1_length": 323, + "unique_sentence1": 798, + "min_sentence2_length": 11, + "avg_sentence2_length": 59.08864468864469, + "max_sentence2_length": 172, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -349,8 +487,14 @@ "hi": { "num_samples": 1365, "number_of_characters": 212269, - "avg_sentence1_len": 105.06446886446886, - "avg_sentence2_len": 50.44395604395604, + "min_sentence1_length": 18, + "avg_sentence1_length": 105.06446886446886, + "max_sentence1_length": 277, + "unique_sentence1": 798, + "min_sentence2_length": 7, + "avg_sentence2_length": 50.44395604395604, + "max_sentence2_length": 152, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -364,8 +508,14 @@ "ru": { "num_samples": 1365, "number_of_characters": 221152, - "avg_sentence1_len": 109.74725274725274, - "avg_sentence2_len": 52.26886446886447, + "min_sentence1_length": 15, + "avg_sentence1_length": 109.74725274725274, + "max_sentence1_length": 310, + "unique_sentence1": 798, + "min_sentence2_length": 8, + "avg_sentence2_length": 52.26886446886447, + "max_sentence2_length": 140, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -379,8 +529,14 @@ "sw": { "num_samples": 1365, "number_of_characters": 210482, - "avg_sentence1_len": 104.32234432234432, - "avg_sentence2_len": 49.87692307692308, + "min_sentence1_length": 13, + "avg_sentence1_length": 104.32234432234432, + "max_sentence1_length": 264, + "unique_sentence1": 798, + "min_sentence2_length": 8, + "avg_sentence2_length": 49.87692307692308, + "max_sentence2_length": 153, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -394,8 +550,14 @@ "th": { "num_samples": 1365, "number_of_characters": 192640, - "avg_sentence1_len": 97.28498168498169, - "avg_sentence2_len": 43.843223443223444, + "min_sentence1_length": 7, + "avg_sentence1_length": 
97.28498168498169, + "max_sentence1_length": 255, + "unique_sentence1": 798, + "min_sentence2_length": 3, + "avg_sentence2_length": 43.843223443223444, + "max_sentence2_length": 140, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -409,8 +571,14 @@ "tr": { "num_samples": 1365, "number_of_characters": 208305, - "avg_sentence1_len": 102.96630036630036, - "avg_sentence2_len": 49.63809523809524, + "min_sentence1_length": 15, + "avg_sentence1_length": 102.96630036630036, + "max_sentence1_length": 269, + "unique_sentence1": 798, + "min_sentence2_length": 10, + "avg_sentence2_length": 49.63809523809524, + "max_sentence2_length": 139, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -424,8 +592,14 @@ "vi": { "num_samples": 1365, "number_of_characters": 224811, - "avg_sentence1_len": 112.26373626373626, - "avg_sentence2_len": 52.432967032967035, + "min_sentence1_length": 18, + "avg_sentence1_length": 112.26373626373626, + "max_sentence1_length": 323, + "unique_sentence1": 798, + "min_sentence2_length": 9, + "avg_sentence2_length": 52.432967032967035, + "max_sentence2_length": 159, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { @@ -439,8 +613,14 @@ "zh": { "num_samples": 1365, "number_of_characters": 67237, - "avg_sentence1_len": 33.41098901098901, - "avg_sentence2_len": 15.846886446886447, + "min_sentence1_length": 5, + "avg_sentence1_length": 33.41098901098901, + "max_sentence1_length": 135, + "unique_sentence1": 798, + "min_sentence2_length": 3, + "avg_sentence2_length": 15.846886446886447, + "max_sentence2_length": 66, + "unique_sentence2": 1365, "unique_labels": 2, "labels": { "0": { diff --git a/mteb/descriptive_stats/Reranking/AskUbuntuDupQuestions.json b/mteb/descriptive_stats/Reranking/AskUbuntuDupQuestions.json index c12f4f292..a0ced7def 100644 --- a/mteb/descriptive_stats/Reranking/AskUbuntuDupQuestions.json +++ b/mteb/descriptive_stats/Reranking/AskUbuntuDupQuestions.json @@ -4,8 +4,17 @@ 
"number_of_characters": 413674, "num_positive": 2255, "num_negative": 5245, - "avg_query_len": 50.205333333333336, - "avg_positive_len": 52.54013303769401, - "avg_negative_len": 52.69189704480458 + "min_query_length": 17, + "avg_query_length": 50.205333333333336, + "max_query_length": 148, + "unique_query": 374, + "min_positive_length": 15, + "avg_positive_length": 52.54013303769401, + "max_positive_length": 152, + "unique_positive": 2165, + "min_negative_length": 15, + "avg_negative_length": 52.69189704480458, + "max_negative_length": 148, + "unique_negative": 5002 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Reranking/ESCIReranking.json b/mteb/descriptive_stats/Reranking/ESCIReranking.json index 419b228eb..9c9556be9 100644 --- a/mteb/descriptive_stats/Reranking/ESCIReranking.json +++ b/mteb/descriptive_stats/Reranking/ESCIReranking.json @@ -4,36 +4,72 @@ "number_of_characters": 254538331, "num_positive": 271416, "num_negative": 44235, - "avg_query_len": 19.691890046098685, - "avg_positive_len": 803.9230995961918, - "avg_negative_len": 808.501458121397, + "min_query_length": 1, + "avg_query_length": 19.691890046098685, + "max_query_length": 151, + "unique_query": 29269, + "min_positive_length": 1, + "avg_positive_length": 803.9230995961918, + "max_positive_length": 8640, + "unique_positive": 217712, + "min_negative_length": 1, + "avg_negative_length": 808.501458121397, + "max_negative_length": 4441, + "unique_negative": 39551, "hf_subset_descriptive_stats": { "us": { "num_samples": 21296, "number_of_characters": 186915609, "num_positive": 189375, "num_negative": 25463, - "avg_query_len": 21.440833959429, - "avg_positive_len": 868.3698006600661, - "avg_negative_len": 864.4493578918431 + "min_query_length": 1, + "avg_query_length": 21.440833959429, + "max_query_length": 151, + "unique_query": 21296, + "min_positive_length": 1, + "avg_positive_length": 868.3698006600661, + "max_positive_length": 5545, + "unique_positive": 150734, + 
"min_negative_length": 1, + "avg_negative_length": 864.4493578918431, + "max_negative_length": 3779, + "unique_negative": 23073 }, "es": { "num_samples": 3703, "number_of_characters": 48861389, "num_positive": 39110, "num_negative": 10183, - "avg_query_len": 20.681609505806104, - "avg_positive_len": 980.9613142418818, - "avg_negative_len": 1023.2159481488756 + "min_query_length": 3, + "avg_query_length": 20.681609505806104, + "max_query_length": 59, + "unique_query": 3703, + "min_positive_length": 1, + "avg_positive_length": 980.9613142418818, + "max_positive_length": 8640, + "unique_positive": 32921, + "min_negative_length": 1, + "avg_negative_length": 1023.2159481488756, + "max_negative_length": 4441, + "unique_negative": 9285 }, "jp": { "num_samples": 4286, "number_of_characters": 18761333, "num_positive": 42931, "num_negative": 8589, - "avg_query_len": 10.146756882874476, - "avg_positive_len": 358.35792317905475, - "avg_negative_len": 388.075445337059 + "min_query_length": 1, + "avg_query_length": 10.146756882874476, + "max_query_length": 60, + "unique_query": 4286, + "min_positive_length": 1, + "avg_positive_length": 358.35792317905475, + "max_positive_length": 3488, + "unique_positive": 35165, + "min_negative_length": 1, + "avg_negative_length": 388.075445337059, + "max_negative_length": 3940, + "unique_negative": 7289 } } } diff --git a/mteb/descriptive_stats/Reranking/WikipediaRerankingMultilingual.json b/mteb/descriptive_stats/Reranking/WikipediaRerankingMultilingual.json index 1c5fe0f03..0506ff39e 100644 --- a/mteb/descriptive_stats/Reranking/WikipediaRerankingMultilingual.json +++ b/mteb/descriptive_stats/Reranking/WikipediaRerankingMultilingual.json @@ -4,153 +4,306 @@ "number_of_characters": 83866932, "num_positive": 24000, "num_negative": 192000, - "avg_query_len": 59.091208333333334, - "avg_positive_len": 385.45120833333334, - "avg_negative_len": 381.23913541666667, + "min_query_length": 7, + "avg_query_length": 59.091208333333334, + 
"max_query_length": 180, + "unique_query": 23997, + "min_positive_length": 100, + "avg_positive_length": 385.45120833333334, + "max_positive_length": 3515, + "unique_positive": 23993, + "min_negative_length": 100, + "avg_negative_length": 381.23913541666667, + "max_negative_length": 9461, + "unique_negative": 191783, "hf_subset_descriptive_stats": { "bg": { "num_samples": 1500, "number_of_characters": 5145316, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 60.82666666666667, - "avg_positive_len": 375.88866666666667, - "avg_negative_len": 374.18691666666666 + "min_query_length": 18, + "avg_query_length": 60.82666666666667, + "max_query_length": 166, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 375.88866666666667, + "max_positive_length": 2241, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 374.18691666666666, + "max_negative_length": 4869, + "unique_negative": 11996 }, "bn": { "num_samples": 1500, "number_of_characters": 5390581, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 47.266666666666666, - "avg_positive_len": 394.5946666666667, - "avg_negative_len": 393.98241666666667 + "min_query_length": 7, + "avg_query_length": 47.266666666666666, + "max_query_length": 123, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 394.5946666666667, + "max_positive_length": 2338, + "unique_positive": 1499, + "min_negative_length": 100, + "avg_negative_length": 393.98241666666667, + "max_negative_length": 5104, + "unique_negative": 11996 }, "cs": { "num_samples": 1500, "number_of_characters": 5079180, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 56.272, - "avg_positive_len": 383.8446666666667, - "avg_negative_len": 368.2504166666667 + "min_query_length": 17, + "avg_query_length": 56.272, + "max_query_length": 137, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 383.8446666666667, + 
"max_positive_length": 2300, + "unique_positive": 1499, + "min_negative_length": 100, + "avg_negative_length": 368.2504166666667, + "max_negative_length": 3487, + "unique_negative": 11982 }, "da": { "num_samples": 1500, "number_of_characters": 4746132, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 56.75066666666667, - "avg_positive_len": 351.6813333333333, - "avg_negative_len": 344.457 + "min_query_length": 17, + "avg_query_length": 56.75066666666667, + "max_query_length": 137, + "unique_query": 1499, + "min_positive_length": 100, + "avg_positive_length": 351.6813333333333, + "max_positive_length": 2159, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 344.457, + "max_negative_length": 2563, + "unique_negative": 11972 }, "de": { "num_samples": 1500, "number_of_characters": 5483592, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 70.004, - "avg_positive_len": 391.5366666666667, - "avg_negative_len": 399.27341666666666 + "min_query_length": 20, + "avg_query_length": 70.004, + "max_query_length": 180, + "unique_query": 1499, + "min_positive_length": 100, + "avg_positive_length": 391.5366666666667, + "max_positive_length": 2674, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 399.27341666666666, + "max_negative_length": 3083, + "unique_negative": 12000 }, "en": { "num_samples": 1500, "number_of_characters": 6217884, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 68.372, - "avg_positive_len": 451.72866666666664, - "avg_negative_len": 453.14441666666664 + "min_query_length": 18, + "avg_query_length": 68.372, + "max_query_length": 162, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 451.72866666666664, + "max_positive_length": 3515, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 453.14441666666664, + "max_negative_length": 3662, + "unique_negative": 12000 }, "fa": { "num_samples": 1500, 
"number_of_characters": 4732619, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 48.66733333333333, - "avg_positive_len": 347.704, - "avg_negative_len": 344.8385 + "min_query_length": 12, + "avg_query_length": 48.66733333333333, + "max_query_length": 119, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 347.704, + "max_positive_length": 2571, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 344.8385, + "max_negative_length": 4707, + "unique_negative": 11978 }, "fi": { "num_samples": 1500, "number_of_characters": 5209132, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 55.343333333333334, - "avg_positive_len": 394.7126666666667, - "avg_negative_len": 377.83733333333333 + "min_query_length": 14, + "avg_query_length": 55.343333333333334, + "max_query_length": 132, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 394.7126666666667, + "max_positive_length": 2129, + "unique_positive": 1498, + "min_negative_length": 100, + "avg_negative_length": 377.83733333333333, + "max_negative_length": 2574, + "unique_negative": 11972 }, "hi": { "num_samples": 1500, "number_of_characters": 5620959, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 50.77733333333333, - "avg_positive_len": 420.3786666666667, - "avg_negative_len": 409.51875 + "min_query_length": 13, + "avg_query_length": 50.77733333333333, + "max_query_length": 125, + "unique_query": 1499, + "min_positive_length": 100, + "avg_positive_length": 420.3786666666667, + "max_positive_length": 2361, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 409.51875, + "max_negative_length": 5912, + "unique_negative": 11996 }, "it": { "num_samples": 1500, "number_of_characters": 5420496, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 70.05466666666666, - "avg_positive_len": 396.97333333333336, - "avg_negative_len": 393.3295 + "min_query_length": 23, + 
"avg_query_length": 70.05466666666666, + "max_query_length": 156, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 396.97333333333336, + "max_positive_length": 2082, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 393.3295, + "max_negative_length": 9461, + "unique_negative": 11993 }, "nl": { "num_samples": 1500, "number_of_characters": 5169556, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 65.34466666666667, - "avg_positive_len": 380.79133333333334, - "avg_negative_len": 375.02933333333334 + "min_query_length": 18, + "avg_query_length": 65.34466666666667, + "max_query_length": 136, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 380.79133333333334, + "max_positive_length": 1864, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 375.02933333333334, + "max_negative_length": 3641, + "unique_negative": 11985 }, "pt": { "num_samples": 1500, "number_of_characters": 5474356, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 65.11933333333333, - "avg_positive_len": 404.01933333333335, - "avg_negative_len": 397.554 + "min_query_length": 18, + "avg_query_length": 65.11933333333333, + "max_query_length": 176, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 404.01933333333335, + "max_positive_length": 3057, + "unique_positive": 1499, + "min_negative_length": 100, + "avg_negative_length": 397.554, + "max_negative_length": 2877, + "unique_negative": 11991 }, "ro": { "num_samples": 1500, "number_of_characters": 4796113, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 61.973333333333336, - "avg_positive_len": 346.70866666666666, - "avg_negative_len": 348.5908333333333 + "min_query_length": 14, + "avg_query_length": 61.973333333333336, + "max_query_length": 169, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 346.70866666666666, + 
"max_positive_length": 1917, + "unique_positive": 1499, + "min_negative_length": 100, + "avg_negative_length": 348.5908333333333, + "max_negative_length": 4213, + "unique_negative": 11971 }, "sr": { "num_samples": 1500, "number_of_characters": 5271732, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 55.669333333333334, - "avg_positive_len": 386.34933333333333, - "avg_negative_len": 384.0586666666667 + "min_query_length": 15, + "avg_query_length": 55.669333333333334, + "max_query_length": 146, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 386.34933333333333, + "max_positive_length": 2421, + "unique_positive": 1499, + "min_negative_length": 100, + "avg_negative_length": 384.0586666666667, + "max_negative_length": 3668, + "unique_negative": 11974 }, "no": { "num_samples": 1500, "number_of_characters": 5036586, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 55.288, - "avg_positive_len": 367.72, - "avg_negative_len": 366.8395 + "min_query_length": 14, + "avg_query_length": 55.288, + "max_query_length": 129, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 367.72, + "max_positive_length": 1450, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 366.8395, + "max_negative_length": 2841, + "unique_negative": 11996 }, "sv": { "num_samples": 1500, "number_of_characters": 5072698, "num_positive": 1500, "num_negative": 12000, - "avg_query_len": 57.73, - "avg_positive_len": 372.58733333333333, - "avg_negative_len": 368.93516666666665 + "min_query_length": 17, + "avg_query_length": 57.73, + "max_query_length": 133, + "unique_query": 1500, + "min_positive_length": 100, + "avg_positive_length": 372.58733333333333, + "max_positive_length": 2493, + "unique_positive": 1500, + "min_negative_length": 100, + "avg_negative_length": 368.93516666666665, + "max_negative_length": 3680, + "unique_negative": 11999 } } } diff --git 
a/mteb/descriptive_stats/Retrieval/AppsRetrieval.json b/mteb/descriptive_stats/Retrieval/AppsRetrieval.json index 8a71a1ad1..caaab2453 100644 --- a/mteb/descriptive_stats/Retrieval/AppsRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/AppsRetrieval.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 2245.837090504686, + "number_of_characters": 11335620, "num_samples": 12530, "num_queries": 3765, "num_documents": 8765, - "average_document_length": 0.0657169048317138, - "average_query_length": 0.4435135244766838, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 152, + "average_document_length": 717.2737022247576, + "max_document_length": 5742, + "unique_documents": 8765, + "min_query_length": 6, + "average_query_length": 1340.9604249667996, + "max_query_length": 289049, + "unique_queries": 3765, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 3765 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/BelebeleRetrieval.json b/mteb/descriptive_stats/Retrieval/BelebeleRetrieval.json index fe213d96d..78c8a7e12 100644 --- a/mteb/descriptive_stats/Retrieval/BelebeleRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/BelebeleRetrieval.json @@ -1,3396 +1,6789 @@ { "test": { - "number_of_characters": 76.49551684802204, + "number_of_characters": 25574620, "num_samples": 521866, "num_queries": 338378, "num_documents": 183488, - "average_document_length": 1.0899895361004534e-05, - "average_query_length": 0.000220154728877238, + "min_document_length": 4, + "average_document_length": 137.38034094872688, + "max_document_length": 237, + "unique_documents": 183488, + "min_query_length": 2, + "average_query_length": 1.0845149507355678, + "max_query_length": 2, + "unique_queries": 338378, + "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0000413738481817, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 
183488, "hf_subset_descriptive_stats": { "acm_Arab-acm_Arab": { - "number_of_characters": 57.84, + "number_of_characters": 51232, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06204444444444445, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 102.98360655737704, + "max_document_length": 129, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "acm_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-acm_Arab": { - "number_of_characters": 57.84, + "number_of_characters": 51232, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06204444444444445, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 102.98360655737704, + "max_document_length": 129, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + 
"min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "afr_Latn-afr_Latn": { - "number_of_characters": 80.04555555555555, + "number_of_characters": 71217, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08671728395061729, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 143.93647540983608, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "afr_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-afr_Latn": { - "number_of_characters": 80.04555555555555, + "number_of_characters": 71217, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08671728395061729, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 143.93647540983608, + "max_document_length": 159, + "unique_documents": 488, + 
"min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "als_Latn-als_Latn": { - "number_of_characters": 78.13555555555556, + "number_of_characters": 69498, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08459506172839507, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 140.4139344262295, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "als_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-als_Latn": { - "number_of_characters": 78.13555555555556, + "number_of_characters": 69498, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08459506172839507, - "average_relevant_docs_per_query": 1.0 + 
"min_document_length": 18, + "average_document_length": 140.4139344262295, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "amh_Ethi-amh_Ethi": { - "number_of_characters": 51.16111111111111, + "number_of_characters": 45221, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.05462345679012346, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 90.66598360655738, + "max_document_length": 100, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "amh_Ethi-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-amh_Ethi": { - "number_of_characters": 51.16111111111111, + "number_of_characters": 45221, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - 
"average_document_length": 0.004098360655737705, - "average_query_length": 0.05462345679012346, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 90.66598360655738, + "max_document_length": 100, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "apc_Arab-apc_Arab": { - "number_of_characters": 57.85777777777778, + "number_of_characters": 51248, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.062064197530864194, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 103.01639344262296, + "max_document_length": 134, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "apc_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-apc_Arab": { - 
"number_of_characters": 57.85777777777778, + "number_of_characters": 51248, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.062064197530864194, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 103.01639344262296, + "max_document_length": 134, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "arb_Arab-arb_Arab": { - "number_of_characters": 60.55, + "number_of_characters": 53671, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06505555555555555, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 107.98155737704919, + "max_document_length": 134, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "arb_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-arb_Arab": { - "number_of_characters": 60.55, + "number_of_characters": 53671, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06505555555555555, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 107.98155737704919, + "max_document_length": 134, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "arb_Latn-arb_Latn": { - "number_of_characters": 69.02444444444444, + "number_of_characters": 61298, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0744716049382716, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 123.61065573770492, + "max_document_length": 160, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "arb_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 
1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-arb_Latn": { - "number_of_characters": 69.02444444444444, + "number_of_characters": 61298, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0744716049382716, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 123.61065573770492, + "max_document_length": 160, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ars_Arab-ars_Arab": { - "number_of_characters": 58.43222222222222, + "number_of_characters": 51765, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06270246913580246, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 104.07581967213115, + "max_document_length": 119, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ars_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 
142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ars_Arab": { - "number_of_characters": 58.43222222222222, + "number_of_characters": 51765, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06270246913580246, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 104.07581967213115, + "max_document_length": 119, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ary_Arab-ary_Arab": { - "number_of_characters": 68.01893095768374, + "number_of_characters": 60261, "num_samples": 1386, "num_queries": 898, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07351774048739837, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 121.48565573770492, + "max_document_length": 138, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.086859688195991, + "max_query_length": 2, + "unique_queries": 898, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ary_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 
0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ary_Arab": { - "number_of_characters": 68.01893095768374, + "number_of_characters": 60261, "num_samples": 1386, "num_queries": 898, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07351774048739837, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 121.48565573770492, + "max_document_length": 138, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.086859688195991, + "max_query_length": 2, + "unique_queries": 898, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "arz_Arab-arz_Arab": { - "number_of_characters": 59.14111111111111, + "number_of_characters": 52403, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06349012345679012, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 105.38319672131148, + "max_document_length": 115, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "arz_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, 
"num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-arz_Arab": { - "number_of_characters": 59.14111111111111, + "number_of_characters": 52403, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06349012345679012, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 105.38319672131148, + "max_document_length": 115, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "asm_Beng-asm_Beng": { - "number_of_characters": 70.26, + "number_of_characters": 62410, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07584444444444445, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 4, + "average_document_length": 125.88934426229508, + "max_document_length": 158, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, 
"asm_Beng-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-asm_Beng": { - "number_of_characters": 70.26, + "number_of_characters": 62410, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07584444444444445, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 4, + "average_document_length": 125.88934426229508, + "max_document_length": 158, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "azj_Latn-azj_Latn": { - "number_of_characters": 75.51222222222222, + "number_of_characters": 67137, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08168024691358025, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 135.57581967213116, + "max_document_length": 156, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "azj_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-azj_Latn": { - "number_of_characters": 75.51222222222222, + "number_of_characters": 67137, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08168024691358025, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 135.57581967213116, + "max_document_length": 156, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "bam_Latn-bam_Latn": { - "number_of_characters": 74.34222222222222, + "number_of_characters": 66084, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08038024691358024, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 133.41803278688525, + "max_document_length": 166, + "unique_documents": 488, + "min_query_length": 2, + 
"average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "bam_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-bam_Latn": { - "number_of_characters": 74.34222222222222, + "number_of_characters": 66084, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08038024691358024, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 133.41803278688525, + "max_document_length": 166, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ben_Beng-ben_Beng": { - "number_of_characters": 71.48444444444445, + "number_of_characters": 63512, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07720493827160495, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 9, + 
"average_document_length": 128.14754098360655, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ben_Beng-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ben_Beng": { - "number_of_characters": 71.48444444444445, + "number_of_characters": 63512, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07720493827160495, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 9, + "average_document_length": 128.14754098360655, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ben_Latn-ben_Latn": { - "number_of_characters": 76.78777777777778, + "number_of_characters": 68285, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - 
"average_query_length": 0.08309753086419754, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 9, + "average_document_length": 137.92827868852459, + "max_document_length": 185, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ben_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ben_Latn": { - "number_of_characters": 76.78777777777778, + "number_of_characters": 68285, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08309753086419754, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 9, + "average_document_length": 137.92827868852459, + "max_document_length": 185, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "bod_Tibt-bod_Tibt": { - "number_of_characters": 88.90222222222222, + "number_of_characters": 79188, 
"num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09655802469135802, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 160.2704918032787, + "max_document_length": 213, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "bod_Tibt-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-bod_Tibt": { - "number_of_characters": 88.90222222222222, + "number_of_characters": 79188, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09655802469135802, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 160.2704918032787, + "max_document_length": 213, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + 
"unique_relevant_docs": 488 }, "bul_Cyrl-bul_Cyrl": { - "number_of_characters": 74.89, + "number_of_characters": 66577, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08098888888888889, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 134.42827868852459, + "max_document_length": 177, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "bul_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-bul_Cyrl": { - "number_of_characters": 74.89, + "number_of_characters": 66577, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08098888888888889, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 134.42827868852459, + "max_document_length": 177, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + 
"min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "cat_Latn-cat_Latn": { - "number_of_characters": 77.40666666666667, + "number_of_characters": 68842, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08378518518518518, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 139.06967213114754, + "max_document_length": 163, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "cat_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-cat_Latn": { - "number_of_characters": 77.40666666666667, + "number_of_characters": 68842, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08378518518518518, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 139.06967213114754, + "max_document_length": 163, + "unique_documents": 488, + 
"min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ceb_Latn-ceb_Latn": { - "number_of_characters": 83.19666666666667, + "number_of_characters": 74053, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09021851851851853, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 149.74795081967213, + "max_document_length": 184, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ceb_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ceb_Latn": { - "number_of_characters": 83.19666666666667, + "number_of_characters": 74053, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09021851851851853, - "average_relevant_docs_per_query": 1.0 + 
"min_document_length": 15, + "average_document_length": 149.74795081967213, + "max_document_length": 184, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ces_Latn-ces_Latn": { - "number_of_characters": 69.73333333333333, + "number_of_characters": 61936, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07525925925925926, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 124.91803278688525, + "max_document_length": 139, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ces_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ces_Latn": { - "number_of_characters": 69.73333333333333, + "number_of_characters": 61936, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - 
"average_document_length": 0.004098360655737705, - "average_query_length": 0.07525925925925926, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 124.91803278688525, + "max_document_length": 139, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ckb_Arab-ckb_Arab": { - "number_of_characters": 73.04555555555555, + "number_of_characters": 64917, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0789395061728395, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 131.0266393442623, + "max_document_length": 178, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ckb_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ckb_Arab": { - "number_of_characters": 
73.04555555555555, + "number_of_characters": 64917, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0789395061728395, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 131.0266393442623, + "max_document_length": 178, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "dan_Latn-dan_Latn": { - "number_of_characters": 74.96888888888888, + "number_of_characters": 66648, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08107654320987653, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 134.5737704918033, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "dan_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + 
"max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-dan_Latn": { - "number_of_characters": 74.96888888888888, + "number_of_characters": 66648, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08107654320987653, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 134.5737704918033, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "deu_Latn-deu_Latn": { - "number_of_characters": 77.32444444444444, + "number_of_characters": 68768, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08369382716049382, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 138.91803278688525, + "max_document_length": 182, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "deu_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + 
"max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-deu_Latn": { - "number_of_characters": 77.32444444444444, + "number_of_characters": 68768, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08369382716049382, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 138.91803278688525, + "max_document_length": 182, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ell_Grek-ell_Grek": { - "number_of_characters": 88.92666666666666, + "number_of_characters": 79210, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09658518518518518, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 160.3155737704918, + "max_document_length": 212, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ell_Grek-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + 
"max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ell_Grek": { - "number_of_characters": 88.92666666666666, + "number_of_characters": 79210, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09658518518518518, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 160.3155737704918, + "max_document_length": 212, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "est_Latn-est_Latn": { - "number_of_characters": 69.55888888888889, + "number_of_characters": 61779, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07506543209876543, - 
"average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 124.59631147540983, + "max_document_length": 164, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "est_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-est_Latn": { - "number_of_characters": 69.55888888888889, + "number_of_characters": 61779, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07506543209876543, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 124.59631147540983, + "max_document_length": 164, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eus_Latn-eus_Latn": { - "number_of_characters": 76.44777777777777, + "number_of_characters": 67979, "num_samples": 1388, "num_queries": 900, 
"num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08271975308641975, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 137.3012295081967, + "max_document_length": 169, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eus_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-eus_Latn": { - "number_of_characters": 76.44777777777777, + "number_of_characters": 67979, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08271975308641975, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 137.3012295081967, + "max_document_length": 169, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "fin_Latn-fin_Latn": { - 
"number_of_characters": 74.50888888888889, + "number_of_characters": 66234, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08056543209876543, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 133.72540983606558, + "max_document_length": 161, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "fin_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-fin_Latn": { - "number_of_characters": 74.50888888888889, + "number_of_characters": 66234, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08056543209876543, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 133.72540983606558, + "max_document_length": 161, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "fra_Latn-fra_Latn": { - "number_of_characters": 92.54222222222222, + "number_of_characters": 82464, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10060246913580247, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 166.98360655737704, + "max_document_length": 204, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "fra_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-fra_Latn": { - "number_of_characters": 92.54222222222222, + "number_of_characters": 82464, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10060246913580247, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 166.98360655737704, + "max_document_length": 204, + "unique_documents": 488, + "min_query_length": 2, + 
"average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "fuv_Latn-fuv_Latn": { - "number_of_characters": 60.42111111111111, + "number_of_characters": 53555, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06491234567901234, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 107.7438524590164, + "max_document_length": 122, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "fuv_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-fuv_Latn": { - "number_of_characters": 60.42111111111111, + "number_of_characters": 53555, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06491234567901234, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + 
"average_document_length": 107.7438524590164, + "max_document_length": 122, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "gaz_Latn-gaz_Latn": { - "number_of_characters": 87.93222222222222, + "number_of_characters": 78315, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09548024691358024, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 158.48155737704917, + "max_document_length": 191, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "gaz_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-gaz_Latn": { - "number_of_characters": 87.93222222222222, + "number_of_characters": 78315, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - 
"average_query_length": 0.09548024691358024, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 158.48155737704917, + "max_document_length": 191, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "grn_Latn-grn_Latn": { - "number_of_characters": 77.10666666666667, + "number_of_characters": 68572, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08345185185185186, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 138.51639344262296, + "max_document_length": 161, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "grn_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-grn_Latn": { - "number_of_characters": 77.10666666666667, + "number_of_characters": 
68572, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08345185185185186, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 138.51639344262296, + "max_document_length": 161, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "guj_Gujr-guj_Gujr": { - "number_of_characters": 64.25666666666666, + "number_of_characters": 57007, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06917407407407407, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 114.81762295081967, + "max_document_length": 138, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "guj_Gujr-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + 
"unique_relevant_docs": 488 }, "eng_Latn-guj_Gujr": { - "number_of_characters": 64.25666666666666, + "number_of_characters": 57007, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06917407407407407, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 114.81762295081967, + "max_document_length": 138, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hat_Latn-hat_Latn": { - "number_of_characters": 72.64666666666666, + "number_of_characters": 64558, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07849629629629629, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 130.29098360655738, + "max_document_length": 179, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hat_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 
900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-hat_Latn": { - "number_of_characters": 72.64666666666666, + "number_of_characters": 64558, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07849629629629629, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 130.29098360655738, + "max_document_length": 179, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hau_Latn-hau_Latn": { - "number_of_characters": 87.8488888888889, + "number_of_characters": 78240, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09538765432098766, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 158.327868852459, + "max_document_length": 183, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hau_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, 
+ "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-hau_Latn": { - "number_of_characters": 87.8488888888889, + "number_of_characters": 78240, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09538765432098766, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 158.327868852459, + "max_document_length": 183, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "heb_Hebr-heb_Hebr": { - "number_of_characters": 57.135555555555555, + "number_of_characters": 50598, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06126172839506173, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 101.68442622950819, + "max_document_length": 134, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "heb_Hebr-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + 
"min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-heb_Hebr": { - "number_of_characters": 57.135555555555555, + "number_of_characters": 50598, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06126172839506173, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 101.68442622950819, + "max_document_length": 134, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hin_Deva-hin_Deva": { - "number_of_characters": 74.61777777777777, + "number_of_characters": 66332, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08068641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 133.9262295081967, + "max_document_length": 165, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hin_Deva-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - 
"average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-hin_Deva": { - "number_of_characters": 74.61777777777777, + "number_of_characters": 66332, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08068641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 133.9262295081967, + "max_document_length": 165, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hin_Latn-hin_Latn": { - "number_of_characters": 76.81222222222222, + "number_of_characters": 68307, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08312469135802468, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 137.9733606557377, + "max_document_length": 170, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hin_Latn-eng_Latn": { - "number_of_characters": 
79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-hin_Latn": { - "number_of_characters": 76.81222222222222, + "number_of_characters": 68307, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08312469135802468, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 137.9733606557377, + "max_document_length": 170, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hrv_Latn-hrv_Latn": { - "number_of_characters": 70.83555555555556, + "number_of_characters": 62928, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07648395061728396, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 126.95081967213115, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, 
+ "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hrv_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-hrv_Latn": { - "number_of_characters": 70.83555555555556, + "number_of_characters": 62928, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07648395061728396, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 126.95081967213115, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hun_Latn-hun_Latn": { - "number_of_characters": 76.40555555555555, + "number_of_characters": 67941, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08267283950617284, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 137.2233606557377, + "max_document_length": 176, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + 
"max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hun_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-hun_Latn": { - "number_of_characters": 76.40555555555555, + "number_of_characters": 67941, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08267283950617284, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 137.2233606557377, + "max_document_length": 176, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hye_Armn-hye_Armn": { - "number_of_characters": 77.42555555555556, + "number_of_characters": 68859, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08380617283950619, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 139.1045081967213, + 
"max_document_length": 193, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hye_Armn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-hye_Armn": { - "number_of_characters": 77.42555555555556, + "number_of_characters": 68859, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08380617283950619, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 139.1045081967213, + "max_document_length": 193, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ibo_Latn-ibo_Latn": { - "number_of_characters": 74.51501668520578, + "number_of_characters": 66167, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08066186505584626, - 
"average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 19, + "average_document_length": 133.58811475409837, + "max_document_length": 156, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "ibo_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ibo_Latn": { - "number_of_characters": 74.51501668520578, + "number_of_characters": 66167, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08066186505584626, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 19, + "average_document_length": 133.58811475409837, + "max_document_length": 156, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "ilo_Latn-ilo_Latn": { - "number_of_characters": 87.7611111111111, + 
"number_of_characters": 78161, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09529012345679012, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 158.16598360655738, + "max_document_length": 187, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ilo_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ilo_Latn": { - "number_of_characters": 87.7611111111111, + "number_of_characters": 78161, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09529012345679012, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 158.16598360655738, + "max_document_length": 187, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + 
"max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ind_Latn-ind_Latn": { - "number_of_characters": 84.10555555555555, + "number_of_characters": 74871, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09122839506172839, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 151.42418032786884, + "max_document_length": 207, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ind_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ind_Latn": { - "number_of_characters": 84.10555555555555, + "number_of_characters": 74871, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09122839506172839, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 151.42418032786884, + "max_document_length": 207, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + 
"max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "isl_Latn-isl_Latn": { - "number_of_characters": 79.27333333333333, + "number_of_characters": 70522, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08585925925925925, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 142.5122950819672, + "max_document_length": 170, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "isl_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-isl_Latn": { - "number_of_characters": 79.27333333333333, + "number_of_characters": 70522, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08585925925925925, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 142.5122950819672, + 
"max_document_length": 170, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ita_Latn-ita_Latn": { - "number_of_characters": 85.49777777777778, + "number_of_characters": 76124, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09277530864197532, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 153.99180327868854, + "max_document_length": 185, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ita_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ita_Latn": { - "number_of_characters": 85.49777777777778, + "number_of_characters": 76124, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09277530864197532, - 
"average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 153.99180327868854, + "max_document_length": 185, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "jav_Latn-jav_Latn": { - "number_of_characters": 80.60666666666667, + "number_of_characters": 71722, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08734074074074075, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 144.97131147540983, + "max_document_length": 174, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "jav_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-jav_Latn": { - "number_of_characters": 80.60666666666667, + "number_of_characters": 71722, "num_samples": 1388, "num_queries": 900, 
"num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08734074074074075, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 144.97131147540983, + "max_document_length": 174, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "jpn_Jpan-jpn_Jpan": { - "number_of_characters": 37.79, + "number_of_characters": 33187, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.039766666666666665, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 66.0061475409836, + "max_document_length": 76, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "jpn_Jpan-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-jpn_Jpan": { - 
"number_of_characters": 37.79, + "number_of_characters": 33187, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.039766666666666665, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 66.0061475409836, + "max_document_length": 76, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kac_Latn-kac_Latn": { - "number_of_characters": 100.64182424916574, + "number_of_characters": 89655, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10972394243511205, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 181.71926229508196, + "max_document_length": 195, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kac_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-kac_Latn": { - "number_of_characters": 100.64182424916574, + "number_of_characters": 89655, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10972394243511205, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 181.71926229508196, + "max_document_length": 195, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kan_Knda-kan_Knda": { - "number_of_characters": 74.13666666666667, + "number_of_characters": 65899, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08015185185185185, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 133.0389344262295, + "max_document_length": 165, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kan_Knda-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + 
"average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-kan_Knda": { - "number_of_characters": 74.13666666666667, + "number_of_characters": 65899, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08015185185185185, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 133.0389344262295, + "max_document_length": 165, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kat_Geor-kat_Geor": { - "number_of_characters": 76.81444444444445, + "number_of_characters": 68309, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08312716049382717, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 137.97745901639345, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kat_Geor-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + 
"average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-kat_Geor": { - "number_of_characters": 76.81444444444445, + "number_of_characters": 68309, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08312716049382717, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 137.97745901639345, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kaz_Cyrl-kaz_Cyrl": { - "number_of_characters": 72.75666666666666, + "number_of_characters": 64657, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07861851851851852, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 130.49385245901638, + "max_document_length": 158, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kaz_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, 
- "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-kaz_Cyrl": { - "number_of_characters": 72.75666666666666, + "number_of_characters": 64657, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07861851851851852, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 130.49385245901638, + "max_document_length": 158, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kea_Latn-kea_Latn": { - "number_of_characters": 77.94111111111111, + "number_of_characters": 69323, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08437901234567902, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 140.05532786885246, + "max_document_length": 183, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kea_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 
70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-kea_Latn": { - "number_of_characters": 77.94111111111111, + "number_of_characters": 69323, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08437901234567902, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 140.05532786885246, + "max_document_length": 183, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "khk_Cyrl-khk_Cyrl": { - "number_of_characters": 75.33444444444444, + "number_of_characters": 66977, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08148271604938272, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 135.24795081967213, + "max_document_length": 162, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + 
"unique_relevant_docs": 488 }, "khk_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-khk_Cyrl": { - "number_of_characters": 75.33444444444444, + "number_of_characters": 66977, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08148271604938272, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 135.24795081967213, + "max_document_length": 162, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "khm_Khmr-khm_Khmr": { - "number_of_characters": 77.74888888888889, + "number_of_characters": 69150, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08416543209876542, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 139.70081967213116, + "max_document_length": 169, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 
900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "khm_Khmr-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-khm_Khmr": { - "number_of_characters": 77.74888888888889, + "number_of_characters": 69150, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08416543209876542, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 139.70081967213116, + "max_document_length": 169, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kin_Latn-kin_Latn": { - "number_of_characters": 81.89655172413794, + "number_of_characters": 72803, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08887269379770626, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 13, + "average_document_length": 147.18647540983608, + "max_document_length": 194, + 
"unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "kin_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-kin_Latn": { - "number_of_characters": 81.89655172413794, + "number_of_characters": 72803, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08887269379770626, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 13, + "average_document_length": 147.18647540983608, + "max_document_length": 194, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "kir_Cyrl-kir_Cyrl": { - "number_of_characters": 76.42333333333333, + "number_of_characters": 67957, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 
0.0826925925925926, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 137.25614754098362, + "max_document_length": 182, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kir_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-kir_Cyrl": { - "number_of_characters": 76.42333333333333, + "number_of_characters": 67957, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0826925925925926, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 137.25614754098362, + "max_document_length": 182, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kor_Hang-kor_Hang": { - "number_of_characters": 37.257777777777775, + "number_of_characters": 32708, "num_samples": 1388, 
"num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.039175308641975305, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 65.02459016393442, + "max_document_length": 88, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "kor_Hang-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-kor_Hang": { - "number_of_characters": 37.257777777777775, + "number_of_characters": 32708, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.039175308641975305, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 65.02459016393442, + "max_document_length": 88, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, 
"lao_Laoo-lao_Laoo": { - "number_of_characters": 65.31333333333333, + "number_of_characters": 57958, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07034814814814815, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 116.76639344262296, + "max_document_length": 142, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "lao_Laoo-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-lao_Laoo": { - "number_of_characters": 65.31333333333333, + "number_of_characters": 57958, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07034814814814815, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 116.76639344262296, + "max_document_length": 142, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + 
"min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "lin_Latn-lin_Latn": { - "number_of_characters": 83.56681514476615, + "number_of_characters": 74223, "num_samples": 1386, "num_queries": 898, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09083164270018503, - "average_relevant_docs_per_query": 1.0022271714922049 + "min_document_length": 17, + "average_document_length": 150.09631147540983, + "max_document_length": 183, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.086859688195991, + "max_query_length": 2, + "unique_queries": 898, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0022271714922049, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "lin_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-lin_Latn": { - "number_of_characters": 83.56681514476615, + "number_of_characters": 74223, "num_samples": 1386, "num_queries": 898, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09083164270018503, - "average_relevant_docs_per_query": 1.0022271714922049 + "min_document_length": 17, + "average_document_length": 150.09631147540983, + 
"max_document_length": 183, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.086859688195991, + "max_query_length": 2, + "unique_queries": 898, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0022271714922049, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "lit_Latn-lit_Latn": { - "number_of_characters": 70.69888888888889, + "number_of_characters": 62805, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0763320987654321, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 126.69877049180327, + "max_document_length": 167, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "lit_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-lit_Latn": { - "number_of_characters": 70.69888888888889, + "number_of_characters": 62805, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 
0.0763320987654321, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 126.69877049180327, + "max_document_length": 167, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "lug_Latn-lug_Latn": { - "number_of_characters": 80.52057842046719, + "number_of_characters": 71566, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08734213394935171, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 12, + "average_document_length": 144.6516393442623, + "max_document_length": 237, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "lug_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-lug_Latn": { - "number_of_characters": 80.52057842046719, + "number_of_characters": 
71566, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08734213394935171, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 12, + "average_document_length": 144.6516393442623, + "max_document_length": 237, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "luo_Latn-luo_Latn": { - "number_of_characters": 75.14333333333333, + "number_of_characters": 66805, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08127037037037037, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 134.8954918032787, + "max_document_length": 178, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "luo_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + 
"max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-luo_Latn": { - "number_of_characters": 75.14333333333333, + "number_of_characters": 66805, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08127037037037037, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 134.8954918032787, + "max_document_length": 178, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "lvs_Latn-lvs_Latn": { - "number_of_characters": 71.97888888888889, + "number_of_characters": 63957, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07775432098765432, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 129.0594262295082, + "max_document_length": 172, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "lvs_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + 
"max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-lvs_Latn": { - "number_of_characters": 71.97888888888889, + "number_of_characters": 63957, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07775432098765432, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 129.0594262295082, + "max_document_length": 172, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mal_Mlym-mal_Mlym": { - "number_of_characters": 82.69222222222223, + "number_of_characters": 73599, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08965802469135803, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 148.81762295081967, + "max_document_length": 191, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mal_Mlym-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + 
"max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-mal_Mlym": { - "number_of_characters": 82.69222222222223, + "number_of_characters": 73599, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08965802469135803, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 148.81762295081967, + "max_document_length": 191, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mar_Deva-mar_Deva": { - "number_of_characters": 70.62625139043382, + "number_of_characters": 62671, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07633620844319669, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 15, + "average_document_length": 126.42418032786885, + "max_document_length": 160, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "mar_Deva-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - 
"average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-mar_Deva": { - "number_of_characters": 70.62625139043382, + "number_of_characters": 62671, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07633620844319669, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 15, + "average_document_length": 126.42418032786885, + "max_document_length": 160, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "mkd_Cyrl-mkd_Cyrl": { - "number_of_characters": 76.01333333333334, + "number_of_characters": 67588, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08223703703703704, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 136.5, + "max_document_length": 180, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mkd_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + 
"number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-mkd_Cyrl": { - "number_of_characters": 76.01333333333334, + "number_of_characters": 67588, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08223703703703704, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 136.5, + "max_document_length": 180, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mlt_Latn-mlt_Latn": { - "number_of_characters": 77.00444444444445, + "number_of_characters": 68480, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08333827160493827, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 138.327868852459, + "max_document_length": 185, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, 
+ "unique_relevant_docs": 488 }, "mlt_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-mlt_Latn": { - "number_of_characters": 77.00444444444445, + "number_of_characters": 68480, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08333827160493827, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 138.327868852459, + "max_document_length": 185, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mri_Latn-mri_Latn": { - "number_of_characters": 83.71444444444444, + "number_of_characters": 74519, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09079382716049382, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 150.702868852459, + "max_document_length": 185, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, 
+ "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mri_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-mri_Latn": { - "number_of_characters": 83.71444444444444, + "number_of_characters": 74519, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09079382716049382, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 150.702868852459, + "max_document_length": 185, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mya_Mymr-mya_Mymr": { - "number_of_characters": 91.28333333333333, + "number_of_characters": 81331, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0992037037037037, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 164.66188524590163, + "max_document_length": 171, + "unique_documents": 488, + 
"min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "mya_Mymr-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-mya_Mymr": { - "number_of_characters": 91.28333333333333, + "number_of_characters": 81331, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0992037037037037, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 164.66188524590163, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "nld_Latn-nld_Latn": { - "number_of_characters": 77.34777777777778, + "number_of_characters": 68789, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08371975308641975, - "average_relevant_docs_per_query": 1.0 + 
"min_document_length": 16, + "average_document_length": 138.9610655737705, + "max_document_length": 183, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "nld_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-nld_Latn": { - "number_of_characters": 77.34777777777778, + "number_of_characters": 68789, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08371975308641975, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 138.9610655737705, + "max_document_length": 183, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "nob_Latn-nob_Latn": { - "number_of_characters": 73.04555555555555, + "number_of_characters": 64917, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - 
"average_document_length": 0.004098360655737705, - "average_query_length": 0.0789395061728395, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 131.0266393442623, + "max_document_length": 168, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "nob_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-nob_Latn": { - "number_of_characters": 73.04555555555555, + "number_of_characters": 64917, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0789395061728395, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 131.0266393442623, + "max_document_length": 168, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "npi_Deva-npi_Deva": { - "number_of_characters": 
68.89666666666666, + "number_of_characters": 61183, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07432962962962962, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 123.375, + "max_document_length": 154, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "npi_Deva-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-npi_Deva": { - "number_of_characters": 68.89666666666666, + "number_of_characters": 61183, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07432962962962962, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 123.375, + "max_document_length": 154, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + 
"max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "npi_Latn-npi_Latn": { - "number_of_characters": 73.89666666666666, + "number_of_characters": 65683, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07988518518518518, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + "average_document_length": 132.59631147540983, + "max_document_length": 154, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "npi_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-npi_Latn": { - "number_of_characters": 73.89666666666666, + "number_of_characters": 65683, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07988518518518518, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + "average_document_length": 132.59631147540983, + "max_document_length": 154, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + 
"max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "nso_Latn-nso_Latn": { - "number_of_characters": 88.77444444444444, + "number_of_characters": 79073, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09641604938271604, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 160.03483606557376, + "max_document_length": 235, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "nso_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-nso_Latn": { - "number_of_characters": 88.77444444444444, + "number_of_characters": 79073, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09641604938271604, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 160.03483606557376, + 
"max_document_length": 235, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "nya_Latn-nya_Latn": { - "number_of_characters": 92.78777777777778, + "number_of_characters": 82685, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.1008753086419753, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 167.43647540983608, + "max_document_length": 215, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "nya_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-nya_Latn": { - "number_of_characters": 92.78777777777778, + "number_of_characters": 82685, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.1008753086419753, - 
"average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 167.43647540983608, + "max_document_length": 215, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ory_Orya-ory_Orya": { - "number_of_characters": 74.95777777777778, + "number_of_characters": 66638, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0810641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 10, + "average_document_length": 134.55327868852459, + "max_document_length": 168, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ory_Orya-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ory_Orya": { - "number_of_characters": 74.95777777777778, + "number_of_characters": 66638, "num_samples": 1388, "num_queries": 900, 
"num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0810641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 10, + "average_document_length": 134.55327868852459, + "max_document_length": 168, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "pan_Guru-pan_Guru": { - "number_of_characters": 75.29777777777778, + "number_of_characters": 66944, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08144197530864197, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 135.18032786885246, + "max_document_length": 157, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "pan_Guru-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-pan_Guru": { - 
"number_of_characters": 75.29777777777778, + "number_of_characters": 66944, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08144197530864197, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 135.18032786885246, + "max_document_length": 157, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "pbt_Arab-pbt_Arab": { - "number_of_characters": 69.67111111111112, + "number_of_characters": 61880, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07519012345679013, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 124.80327868852459, + "max_document_length": 155, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "pbt_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-pbt_Arab": { - "number_of_characters": 69.67111111111112, + "number_of_characters": 61880, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07519012345679013, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 124.80327868852459, + "max_document_length": 155, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "pes_Arab-pes_Arab": { - "number_of_characters": 66.75111111111111, + "number_of_characters": 59252, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07194567901234568, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 119.41803278688525, + "max_document_length": 152, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "pes_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + 
"average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-pes_Arab": { - "number_of_characters": 66.75111111111111, + "number_of_characters": 59252, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07194567901234568, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 119.41803278688525, + "max_document_length": 152, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "plt_Latn-plt_Latn": { - "number_of_characters": 96.99555555555555, + "number_of_characters": 86472, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10555061728395061, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 175.19672131147541, + "max_document_length": 222, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "plt_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + 
"average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-plt_Latn": { - "number_of_characters": 96.99555555555555, + "number_of_characters": 86472, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10555061728395061, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 175.19672131147541, + "max_document_length": 222, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "pol_Latn-pol_Latn": { - "number_of_characters": 76.09777777777778, + "number_of_characters": 67664, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08233086419753087, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 136.65573770491804, + "max_document_length": 196, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "pol_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, 
- "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-pol_Latn": { - "number_of_characters": 76.09777777777778, + "number_of_characters": 67664, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08233086419753087, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 136.65573770491804, + "max_document_length": 196, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "por_Latn-por_Latn": { - "number_of_characters": 80.11666666666666, + "number_of_characters": 71281, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08679629629629629, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 144.06762295081967, + "max_document_length": 179, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "por_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 
70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-por_Latn": { - "number_of_characters": 80.11666666666666, + "number_of_characters": 71281, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08679629629629629, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 144.06762295081967, + "max_document_length": 179, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ron_Latn-ron_Latn": { - "number_of_characters": 80.74222222222222, + "number_of_characters": 71844, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08749135802469137, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 145.22131147540983, + "max_document_length": 181, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + 
"unique_relevant_docs": 488 }, "ron_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ron_Latn": { - "number_of_characters": 80.74222222222222, + "number_of_characters": 71844, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08749135802469137, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 145.22131147540983, + "max_document_length": 181, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "rus_Cyrl-rus_Cyrl": { - "number_of_characters": 85.16333333333333, + "number_of_characters": 75823, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0924037037037037, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 153.375, + "max_document_length": 196, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + 
"min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "rus_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-rus_Cyrl": { - "number_of_characters": 85.16333333333333, + "number_of_characters": 75823, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0924037037037037, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 153.375, + "max_document_length": 196, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "shn_Mymr-shn_Mymr": { - "number_of_characters": 77.90222222222222, + "number_of_characters": 69288, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0843358024691358, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 139.98360655737704, + "max_document_length": 159, + "unique_documents": 488, + 
"min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "shn_Mymr-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-shn_Mymr": { - "number_of_characters": 77.90222222222222, + "number_of_characters": 69288, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0843358024691358, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 139.98360655737704, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sin_Latn-sin_Latn": { - "number_of_characters": 96.46666666666667, + "number_of_characters": 85996, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10496296296296297, - "average_relevant_docs_per_query": 1.0 + 
"min_document_length": 19, + "average_document_length": 174.22131147540983, + "max_document_length": 224, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sin_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-sin_Latn": { - "number_of_characters": 96.46666666666667, + "number_of_characters": 85996, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10496296296296297, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 174.22131147540983, + "max_document_length": 224, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sin_Sinh-sin_Sinh": { - "number_of_characters": 71.91777777777777, + "number_of_characters": 63902, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - 
"average_document_length": 0.004098360655737705, - "average_query_length": 0.07768641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 128.94672131147541, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sin_Sinh-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-sin_Sinh": { - "number_of_characters": 71.91777777777777, + "number_of_characters": 63902, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07768641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 128.94672131147541, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "slk_Latn-slk_Latn": { - 
"number_of_characters": 70.5411111111111, + "number_of_characters": 62663, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07615679012345679, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 126.4077868852459, + "max_document_length": 146, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "slk_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-slk_Latn": { - "number_of_characters": 70.5411111111111, + "number_of_characters": 62663, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07615679012345679, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 126.4077868852459, + "max_document_length": 146, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "slv_Latn-slv_Latn": { - "number_of_characters": 70.79888888888888, + "number_of_characters": 62895, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0764432098765432, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 126.88319672131148, + "max_document_length": 176, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "slv_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-slv_Latn": { - "number_of_characters": 70.79888888888888, + "number_of_characters": 62895, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0764432098765432, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 126.88319672131148, + "max_document_length": 176, + "unique_documents": 488, + "min_query_length": 2, + 
"average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sna_Latn-sna_Latn": { - "number_of_characters": 83.30700778642937, + "number_of_characters": 74071, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09044161044096703, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + "average_document_length": 149.78483606557376, + "max_document_length": 191, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sna_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-sna_Latn": { - "number_of_characters": 83.30700778642937, + "number_of_characters": 74071, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09044161044096703, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + 
"average_document_length": 149.78483606557376, + "max_document_length": 191, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "snd_Arab-snd_Arab": { - "number_of_characters": 65.42333333333333, + "number_of_characters": 58057, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07047037037037036, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 116.96926229508196, + "max_document_length": 164, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "snd_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-snd_Arab": { - "number_of_characters": 65.42333333333333, + "number_of_characters": 58057, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - 
"average_query_length": 0.07047037037037036, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 116.96926229508196, + "max_document_length": 164, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "som_Latn-som_Latn": { - "number_of_characters": 92.95777777777778, + "number_of_characters": 82838, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.1010641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 167.75, + "max_document_length": 201, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "som_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-som_Latn": { - "number_of_characters": 92.95777777777778, + "number_of_characters": 82838, 
"num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.1010641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 167.75, + "max_document_length": 201, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sot_Latn-sot_Latn": { - "number_of_characters": 85.13111111111111, + "number_of_characters": 75794, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0923679012345679, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 153.3155737704918, + "max_document_length": 186, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sot_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 
}, "eng_Latn-sot_Latn": { - "number_of_characters": 85.13111111111111, + "number_of_characters": 75794, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0923679012345679, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 153.3155737704918, + "max_document_length": 186, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "spa_Latn-spa_Latn": { - "number_of_characters": 84.16, + "number_of_characters": 74920, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09128888888888889, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 151.52459016393442, + "max_document_length": 180, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "spa_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-spa_Latn": { - "number_of_characters": 84.16, + "number_of_characters": 74920, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09128888888888889, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 151.52459016393442, + "max_document_length": 180, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "srp_Cyrl-srp_Cyrl": { - "number_of_characters": 69.49833147942158, + "number_of_characters": 61657, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07508157005497394, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 13, + "average_document_length": 124.34631147540983, + "max_document_length": 160, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "srp_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 
2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-srp_Cyrl": { - "number_of_characters": 69.49833147942158, + "number_of_characters": 61657, "num_samples": 1387, "num_queries": 899, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07508157005497394, - "average_relevant_docs_per_query": 1.0011123470522802 + "min_document_length": 13, + "average_document_length": 124.34631147540983, + "max_document_length": 160, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.085650723025584, + "max_query_length": 2, + "unique_queries": 899, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0011123470522802, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 488 }, "ssw_Latn-ssw_Latn": { - "number_of_characters": 83.09777777777778, + "number_of_characters": 73964, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09010864197530864, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 149.5655737704918, + "max_document_length": 182, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ssw_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + 
"min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-ssw_Latn": { - "number_of_characters": 83.09777777777778, + "number_of_characters": 73964, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09010864197530864, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 149.5655737704918, + "max_document_length": 182, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sun_Latn-sun_Latn": { - "number_of_characters": 80.16, + "number_of_characters": 71320, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08684444444444445, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 144.14754098360655, + "max_document_length": 173, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sun_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 
0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-sun_Latn": { - "number_of_characters": 80.16, + "number_of_characters": 71320, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08684444444444445, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 144.14754098360655, + "max_document_length": 173, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "swe_Latn-swe_Latn": { - "number_of_characters": 70.67666666666666, + "number_of_characters": 62785, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07630740740740741, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 126.6577868852459, + "max_document_length": 154, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "swe_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + 
"number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-swe_Latn": { - "number_of_characters": 70.67666666666666, + "number_of_characters": 62785, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07630740740740741, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 126.6577868852459, + "max_document_length": 154, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "swh_Latn-swh_Latn": { - "number_of_characters": 82.56, + "number_of_characters": 73480, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08951111111111111, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 148.5737704918033, + "max_document_length": 194, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 
1, + "unique_relevant_docs": 488 }, "swh_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-swh_Latn": { - "number_of_characters": 82.56, + "number_of_characters": 73480, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08951111111111111, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 148.5737704918033, + "max_document_length": 194, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tam_Taml-tam_Taml": { - "number_of_characters": 83.12777777777778, + "number_of_characters": 73991, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09014197530864197, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + "average_document_length": 149.62090163934425, + "max_document_length": 181, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + 
"min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tam_Taml-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tam_Taml": { - "number_of_characters": 83.12777777777778, + "number_of_characters": 73991, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09014197530864197, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + "average_document_length": 149.62090163934425, + "max_document_length": 181, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tel_Telu-tel_Telu": { - "number_of_characters": 74.18777777777778, + "number_of_characters": 65945, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08020864197530865, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 133.13319672131146, + "max_document_length": 149, + "unique_documents": 488, + 
"min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tel_Telu-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tel_Telu": { - "number_of_characters": 74.18777777777778, + "number_of_characters": 65945, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08020864197530865, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 133.13319672131146, + "max_document_length": 149, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tgk_Cyrl-tgk_Cyrl": { - "number_of_characters": 76.28111111111112, + "number_of_characters": 67829, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08253456790123458, - "average_relevant_docs_per_query": 1.0 + 
"min_document_length": 11, + "average_document_length": 136.99385245901638, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tgk_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tgk_Cyrl": { - "number_of_characters": 76.28111111111112, + "number_of_characters": 67829, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08253456790123458, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 136.99385245901638, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tgl_Latn-tgl_Latn": { - "number_of_characters": 84.34555555555555, + "number_of_characters": 75087, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - 
"average_document_length": 0.004098360655737705, - "average_query_length": 0.09149506172839506, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 151.86680327868854, + "max_document_length": 184, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tgl_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tgl_Latn": { - "number_of_characters": 84.34555555555555, + "number_of_characters": 75087, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09149506172839506, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 151.86680327868854, + "max_document_length": 184, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tha_Thai-tha_Thai": { - 
"number_of_characters": 61.46666666666667, + "number_of_characters": 54496, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06607407407407408, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 109.67213114754098, + "max_document_length": 123, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tha_Thai-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tha_Thai": { - "number_of_characters": 61.46666666666667, + "number_of_characters": 54496, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06607407407407408, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 109.67213114754098, + "max_document_length": 123, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tir_Ethi-tir_Ethi": { - "number_of_characters": 53.99888888888889, + "number_of_characters": 47775, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.057776543209876546, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 95.89959016393442, + "max_document_length": 110, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tir_Ethi-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tir_Ethi": { - "number_of_characters": 53.99888888888889, + "number_of_characters": 47775, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.057776543209876546, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 95.89959016393442, + "max_document_length": 110, + "unique_documents": 488, + "min_query_length": 2, + 
"average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tsn_Latn-tsn_Latn": { - "number_of_characters": 89.12777777777778, + "number_of_characters": 79391, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09680864197530864, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 160.68647540983608, + "max_document_length": 204, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tsn_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tsn_Latn": { - "number_of_characters": 89.12777777777778, + "number_of_characters": 79391, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.09680864197530864, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + 
"average_document_length": 160.68647540983608, + "max_document_length": 204, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tso_Latn-tso_Latn": { - "number_of_characters": 93.69444444444444, + "number_of_characters": 83501, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10188271604938272, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 169.10860655737704, + "max_document_length": 215, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tso_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tso_Latn": { - "number_of_characters": 93.69444444444444, + "number_of_characters": 83501, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, 
- "average_query_length": 0.10188271604938272, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 169.10860655737704, + "max_document_length": 215, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tur_Latn-tur_Latn": { - "number_of_characters": 73.56222222222222, + "number_of_characters": 65382, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07951358024691357, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 131.9795081967213, + "max_document_length": 158, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "tur_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-tur_Latn": { - "number_of_characters": 73.56222222222222, + "number_of_characters": 
65382, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07951358024691357, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 131.9795081967213, + "max_document_length": 158, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ukr_Cyrl-ukr_Cyrl": { - "number_of_characters": 74.08222222222223, + "number_of_characters": 65850, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08009135802469136, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 132.93852459016392, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ukr_Cyrl-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + 
"unique_relevant_docs": 488 }, "eng_Latn-ukr_Cyrl": { - "number_of_characters": 74.08222222222223, + "number_of_characters": 65850, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08009135802469136, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 132.93852459016392, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "urd_Arab-urd_Arab": { - "number_of_characters": 72.52666666666667, + "number_of_characters": 64450, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07836296296296297, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 130.06967213114754, + "max_document_length": 187, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "urd_Arab-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 
900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-urd_Arab": { - "number_of_characters": 72.52666666666667, + "number_of_characters": 64450, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07836296296296297, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 130.06967213114754, + "max_document_length": 187, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "urd_Latn-urd_Latn": { - "number_of_characters": 92.07, + "number_of_characters": 82039, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10007777777777777, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 166.1127049180328, + "max_document_length": 230, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "urd_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + 
"min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-urd_Latn": { - "number_of_characters": 92.07, + "number_of_characters": 82039, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10007777777777777, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 166.1127049180328, + "max_document_length": 230, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "uzn_Latn-uzn_Latn": { - "number_of_characters": 79.61333333333333, + "number_of_characters": 70828, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08623703703703703, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 143.13934426229508, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "uzn_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + 
"average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-uzn_Latn": { - "number_of_characters": 79.61333333333333, + "number_of_characters": 70828, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08623703703703703, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 143.13934426229508, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "vie_Latn-vie_Latn": { - "number_of_characters": 75.05333333333333, + "number_of_characters": 66724, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08117037037037036, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 134.7295081967213, + "max_document_length": 161, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "vie_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - 
"average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-vie_Latn": { - "number_of_characters": 75.05333333333333, + "number_of_characters": 66724, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08117037037037036, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 134.7295081967213, + "max_document_length": 161, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "war_Latn-war_Latn": { - "number_of_characters": 88.07555555555555, + "number_of_characters": 78444, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0956395061728395, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 158.74590163934425, + "max_document_length": 207, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "war_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, 
"num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-war_Latn": { - "number_of_characters": 88.07555555555555, + "number_of_characters": 78444, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0956395061728395, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 158.74590163934425, + "max_document_length": 207, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "wol_Latn-wol_Latn": { - "number_of_characters": 72.60555555555555, + "number_of_characters": 64521, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07845061728395061, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 130.21516393442624, + "max_document_length": 139, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + 
"unique_relevant_docs": 488 }, "wol_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-wol_Latn": { - "number_of_characters": 72.60555555555555, + "number_of_characters": 64521, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07845061728395061, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 130.21516393442624, + "max_document_length": 139, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "xho_Latn-xho_Latn": { - "number_of_characters": 80.50333333333333, + "number_of_characters": 71629, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08722592592592593, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 144.78073770491804, + "max_document_length": 179, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 
900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "xho_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-xho_Latn": { - "number_of_characters": 80.50333333333333, + "number_of_characters": 71629, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08722592592592593, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 144.78073770491804, + "max_document_length": 179, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "yor_Latn-yor_Latn": { - "number_of_characters": 70.64, + "number_of_characters": 62752, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07626666666666666, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 126.59016393442623, + "max_document_length": 143, + "unique_documents": 488, + 
"min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "yor_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-yor_Latn": { - "number_of_characters": 70.64, + "number_of_characters": 62752, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07626666666666666, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 11, + "average_document_length": 126.59016393442623, + "max_document_length": 143, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "zho_Hans-zho_Hans": { - "number_of_characters": 23.747777777777777, + "number_of_characters": 20549, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.024164197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 7, 
+ "average_document_length": 40.10860655737705, + "max_document_length": 64, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "zho_Hans-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-zho_Hans": { - "number_of_characters": 23.747777777777777, + "number_of_characters": 20549, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.024164197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 7, + "average_document_length": 40.10860655737705, + "max_document_length": 64, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "zho_Hant-zho_Hant": { - "number_of_characters": 23.07888888888889, + "number_of_characters": 19947, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - 
"average_query_length": 0.02342098765432099, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 7, + "average_document_length": 38.875, + "max_document_length": 45, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "zho_Hant-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-zho_Hant": { - "number_of_characters": 23.07888888888889, + "number_of_characters": 19947, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.02342098765432099, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 7, + "average_document_length": 38.875, + "max_document_length": 45, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "zsm_Latn-zsm_Latn": { - "number_of_characters": 80.92444444444445, + "number_of_characters": 72008, "num_samples": 1388, 
"num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08769382716049383, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 145.55737704918033, + "max_document_length": 210, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "zsm_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-zsm_Latn": { - "number_of_characters": 80.92444444444445, + "number_of_characters": 72008, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08769382716049383, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 145.55737704918033, + "max_document_length": 210, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, 
"zul_Latn-zul_Latn": { - "number_of_characters": 78.0411111111111, + "number_of_characters": 69413, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08449012345679012, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 140.23975409836066, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "zul_Latn-eng_Latn": { - "number_of_characters": 79.34777777777778, + "number_of_characters": 70589, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08594197530864198, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 142.64959016393442, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "eng_Latn-zul_Latn": { - "number_of_characters": 78.0411111111111, + "number_of_characters": 69413, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08449012345679012, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 140.23975409836066, + "max_document_length": 171, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + 
"min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "arb_Arab-arb_Latn": { - "number_of_characters": 69.02444444444444, + "number_of_characters": 61298, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.0744716049382716, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 12, + "average_document_length": 123.61065573770492, + "max_document_length": 160, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "arb_Latn-arb_Arab": { - "number_of_characters": 60.55, + "number_of_characters": 53671, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.06505555555555555, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 107.98155737704919, + "max_document_length": 134, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ben_Beng-ben_Latn": { - "number_of_characters": 76.78777777777778, + "number_of_characters": 68285, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08309753086419754, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 9, + "average_document_length": 137.92827868852459, + "max_document_length": 185, + "unique_documents": 488, + 
"min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "ben_Latn-ben_Beng": { - "number_of_characters": 71.48444444444445, + "number_of_characters": 63512, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07720493827160495, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 9, + "average_document_length": 128.14754098360655, + "max_document_length": 175, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hin_Deva-hin_Latn": { - "number_of_characters": 76.81222222222222, + "number_of_characters": 68307, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08312469135802468, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 137.9733606557377, + "max_document_length": 170, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "hin_Latn-hin_Deva": { - "number_of_characters": 74.61777777777777, + "number_of_characters": 66332, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.08068641975308642, - "average_relevant_docs_per_query": 1.0 + 
"min_document_length": 14, + "average_document_length": 133.9262295081967, + "max_document_length": 165, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "npi_Deva-npi_Latn": { - "number_of_characters": 73.89666666666666, + "number_of_characters": 65683, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07988518518518518, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + "average_document_length": 132.59631147540983, + "max_document_length": 154, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "npi_Latn-npi_Deva": { - "number_of_characters": 68.89666666666666, + "number_of_characters": 61183, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07432962962962962, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 123.375, + "max_document_length": 154, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sin_Sinh-sin_Latn": { - "number_of_characters": 96.46666666666667, + "number_of_characters": 85996, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 
0.004098360655737705, - "average_query_length": 0.10496296296296297, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 174.22131147540983, + "max_document_length": 224, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "sin_Latn-sin_Sinh": { - "number_of_characters": 71.91777777777777, + "number_of_characters": 63902, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07768641975308642, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 17, + "average_document_length": 128.94672131147541, + "max_document_length": 159, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "urd_Arab-urd_Latn": { - "number_of_characters": 92.07, + "number_of_characters": 82039, "num_samples": 1388, "num_queries": 900, "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.10007777777777777, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 15, + "average_document_length": 166.1127049180328, + "max_document_length": 230, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 }, "urd_Latn-urd_Arab": { - "number_of_characters": 72.52666666666667, - "num_samples": 
1388, - "num_queries": 900, - "num_documents": 488, - "average_document_length": 0.004098360655737705, - "average_query_length": 0.07836296296296297, - "average_relevant_docs_per_query": 1.0 + "number_of_characters": 64450, + "num_samples": 1388, + "num_queries": 900, + "num_documents": 488, + "min_document_length": 11, + "average_document_length": 130.06967213114754, + "max_document_length": 187, + "unique_documents": 488, + "min_query_length": 2, + "average_query_length": 1.0844444444444445, + "max_query_length": 2, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 488 } } } diff --git a/mteb/descriptive_stats/Retrieval/COIRCodeSearchNetRetrieval.json b/mteb/descriptive_stats/Retrieval/COIRCodeSearchNetRetrieval.json index e9ff1f787..3d27f624b 100644 --- a/mteb/descriptive_stats/Retrieval/COIRCodeSearchNetRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/COIRCodeSearchNetRetrieval.json @@ -1,66 +1,129 @@ { "test": { - "number_of_characters": 664.7686497593272, + "number_of_characters": 36843313, "num_samples": 1056326, "num_queries": 52561, "num_documents": 1003765, - "average_document_length": 1.9924982441109223e-06, - "average_query_length": 0.012609513703303347, + "min_document_length": 54, + "average_document_length": 34.70511822986456, + "max_document_length": 334374, + "unique_documents": 1003765, + "min_query_length": 2, + "average_query_length": 38.19428854093339, + "max_query_length": 2, + "unique_queries": 52561, + "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 52561, "hf_subset_descriptive_stats": { "python": { - "number_of_characters": 941.4041426464673, + "number_of_characters": 14574651, "num_samples": 295228, "num_queries": 14918, "num_documents": 280310, - "average_document_length": 7.134957725375477e-06, - "average_query_length": 
0.0629711853228628, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 95, + "average_document_length": 49.994759373550714, + "max_document_length": 14008, + "unique_documents": 280310, + "min_query_length": 2, + "average_query_length": 37.5801045716584, + "max_query_length": 2, + "unique_queries": 14918, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 14918 }, "javascript": { - "number_of_characters": 748.8343968398663, + "number_of_characters": 2587540, "num_samples": 68145, "num_queries": 3291, "num_documents": 64854, - "average_document_length": 3.0838498781879296e-05, - "average_query_length": 0.2269323600242681, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 87, + "average_document_length": 37.89792456903198, + "max_document_length": 334374, + "unique_documents": 64854, + "min_query_length": 2, + "average_query_length": 39.412944393801276, + "max_query_length": 2, + "unique_queries": 3291, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 3291 }, "go": { - "number_of_characters": 405.3770007387343, + "number_of_characters": 3641108, "num_samples": 190562, "num_queries": 8122, "num_documents": 182440, - "average_document_length": 1.0962508221881167e-05, - "average_query_length": 0.049664737840277556, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 54, + "average_document_length": 17.957838193378645, + "max_document_length": 5280, + "unique_documents": 182440, + "min_query_length": 2, + "average_query_length": 44.9248953459739, + "max_query_length": 2, + "unique_queries": 8122, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 8122 }, "ruby": { - "number_of_characters": 457.43695479777955, + "number_of_characters": 629446, "num_samples": 
28831, "num_queries": 1261, "num_documents": 27570, - "average_document_length": 7.254261878853827e-05, - "average_query_length": 0.3611712567785722, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 83, + "average_document_length": 20.830830612985128, + "max_document_length": 3992, + "unique_documents": 27570, + "min_query_length": 2, + "average_query_length": 43.72720063441713, + "max_query_length": 2, + "unique_queries": 1261, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1261 }, "java": { - "number_of_characters": 588.8922866271109, + "number_of_characters": 6791137, "num_samples": 191821, "num_queries": 10955, "num_documents": 180866, - "average_document_length": 1.1057910276116019e-05, - "average_query_length": 0.053573006538303145, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 77, + "average_document_length": 35.54789180940586, + "max_document_length": 7615, + "unique_documents": 180866, + "min_query_length": 2, + "average_query_length": 33.019808306709265, + "max_query_length": 2, + "unique_queries": 10955, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 10955 }, "php": { - "number_of_characters": 578.8503639217925, + "number_of_characters": 8619431, "num_samples": 281739, "num_queries": 14014, "num_documents": 267725, - "average_document_length": 7.470352040339901e-06, - "average_query_length": 0.041162434987997175, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 94, + "average_document_length": 30.195091978709495, + "max_document_length": 4904, + "unique_documents": 267725, + "min_query_length": 2, + "average_query_length": 38.20822035107749, + "max_query_length": 2, + "unique_queries": 14014, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + 
"unique_relevant_docs": 14014 } } } diff --git a/mteb/descriptive_stats/Retrieval/CodeEditSearchRetrieval.json b/mteb/descriptive_stats/Retrieval/CodeEditSearchRetrieval.json index a0325c638..6d73096d4 100644 --- a/mteb/descriptive_stats/Retrieval/CodeEditSearchRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/CodeEditSearchRetrieval.json @@ -1,129 +1,255 @@ { "train": { - "number_of_characters": 71.98776923076923, + "number_of_characters": 935841, "num_samples": 26000, "num_queries": 13000, "num_documents": 13000, - "average_document_length": 7.692307692307693e-05, - "average_query_length": 0.005460597633136095, + "min_document_length": 18, + "average_document_length": 70.98776923076923, + "max_document_length": 2532, + "unique_documents": 13000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 13000, + "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 13000, "hf_subset_descriptive_stats": { "python": { - "number_of_characters": 70.519, + "number_of_characters": 70519, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.06951900000000001, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 21, + "average_document_length": 69.519, + "max_document_length": 1811, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "javascript": { - "number_of_characters": 57.88, + "number_of_characters": 57880, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.05688, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + 
"average_document_length": 56.88, + "max_document_length": 601, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "typescript": { - "number_of_characters": 61.092, + "number_of_characters": 61092, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.060092, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 60.092, + "max_document_length": 659, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "go": { - "number_of_characters": 71.797, + "number_of_characters": 71797, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.070797, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 70.797, + "max_document_length": 1529, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "ruby": { - "number_of_characters": 67.9, + "number_of_characters": 67900, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.0669, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + "average_document_length": 66.9, + "max_document_length": 751, + "unique_documents": 1000, + 
"min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "java": { - "number_of_characters": 63.984, + "number_of_characters": 63984, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.062984, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 23, + "average_document_length": 62.984, + "max_document_length": 807, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "php": { - "number_of_characters": 62.927, + "number_of_characters": 62927, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.061927, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 21, + "average_document_length": 61.927, + "max_document_length": 766, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "c": { - "number_of_characters": 98.588, + "number_of_characters": 98588, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.097588, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 20, + "average_document_length": 97.588, + "max_document_length": 1672, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 
1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "c++": { - "number_of_characters": 115.48, + "number_of_characters": 115480, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.11448, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 22, + "average_document_length": 114.48, + "max_document_length": 1856, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "rust": { - "number_of_characters": 68.503, + "number_of_characters": 68503, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.067503, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 67.503, + "max_document_length": 2532, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "swift": { - "number_of_characters": 58.279, + "number_of_characters": 58279, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.057279000000000004, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 19, + "average_document_length": 57.279, + "max_document_length": 727, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, 
+ "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "scala": { - "number_of_characters": 65.833, + "number_of_characters": 65833, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.064833, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 22, + "average_document_length": 64.833, + "max_document_length": 685, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "shell": { - "number_of_characters": 73.059, + "number_of_characters": 73059, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.072059, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 72.059, + "max_document_length": 813, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 } } } diff --git a/mteb/descriptive_stats/Retrieval/CodeFeedbackMT.json b/mteb/descriptive_stats/Retrieval/CodeFeedbackMT.json index c49801192..1be18319c 100644 --- a/mteb/descriptive_stats/Retrieval/CodeFeedbackMT.json +++ b/mteb/descriptive_stats/Retrieval/CodeFeedbackMT.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 5894.401984777533, + "number_of_characters": 156266302, "num_samples": 79660, "num_queries": 13277, "num_documents": 66383, - "average_document_length": 0.022127347788495202, - "average_query_length": 0.3333224566192555, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 127, + "average_document_length": 
885.131117906693, + "max_document_length": 32432, + "unique_documents": 66383, + "min_query_length": 2, + "average_query_length": 7344.177374406869, + "max_query_length": 9403, + "unique_queries": 13277, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 13277 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CodeFeedbackST.json b/mteb/descriptive_stats/Retrieval/CodeFeedbackST.json index a7e653149..4511605dd 100644 --- a/mteb/descriptive_stats/Retrieval/CodeFeedbackST.json +++ b/mteb/descriptive_stats/Retrieval/CodeFeedbackST.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 2246.575885305433, + "number_of_characters": 260957682, "num_samples": 187832, "num_queries": 31306, "num_documents": 156526, - "average_document_length": 0.009725743421916316, - "average_query_length": 0.02313435668710662, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 26, + "average_document_length": 144.85253568097312, + "max_document_length": 13851, + "unique_documents": 156526, + "min_query_length": 1, + "average_query_length": 7611.464064396601, + "max_query_length": 11354, + "unique_queries": 31306, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 31306 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CodeSearchNetCCRetrieval.json b/mteb/descriptive_stats/Retrieval/CodeSearchNetCCRetrieval.json index 96802f81d..a817119b4 100644 --- a/mteb/descriptive_stats/Retrieval/CodeSearchNetCCRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/CodeSearchNetCCRetrieval.json @@ -1,66 +1,129 @@ { "test": { - "number_of_characters": 390.06276516809044, + "number_of_characters": 22407915, "num_samples": 1058035, "num_queries": 52561, "num_documents": 1005474, - "average_document_length": 1.9891116030847143e-06, - "average_query_length": 
0.007383093266263778, + "min_document_length": 23, + "average_document_length": 20.28592186371801, + "max_document_length": 214210, + "unique_documents": 1005474, + "min_query_length": 2, + "average_query_length": 38.259317745096176, + "max_query_length": 2, + "unique_queries": 52561, + "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 52561, "hf_subset_descriptive_stats": { "python": { - "number_of_characters": 553.7934039415471, + "number_of_characters": 8792958, "num_samples": 295570, "num_queries": 14918, "num_documents": 280652, - "average_document_length": 7.126263130139817e-06, - "average_query_length": 0.0369884303486759, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 38, + "average_document_length": 29.330466200133973, + "max_document_length": 8326, + "unique_documents": 280652, + "min_query_length": 2, + "average_query_length": 37.62595522187961, + "max_query_length": 2, + "unique_queries": 14918, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 14918 }, "javascript": { - "number_of_characters": 445.70707991491946, + "number_of_characters": 1590642, "num_samples": 68492, "num_queries": 3291, "num_documents": 65201, - "average_document_length": 3.0674376159874846e-05, - "average_query_length": 0.1348243937754237, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 40, + "average_document_length": 22.395975521847824, + "max_document_length": 214210, + "unique_documents": 65201, + "min_query_length": 2, + "average_query_length": 39.6238225463385, + "max_query_length": 2, + "unique_queries": 3291, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 3291 }, "go": { - "number_of_characters": 235.76803742920464, + "number_of_characters": 2264134, "num_samples": 190857, 
"num_queries": 8122, "num_documents": 182735, - "average_document_length": 1.0944810791583441e-05, - "average_query_length": 0.028782077989313547, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 23, + "average_document_length": 10.390259118395491, + "max_document_length": 3589, + "unique_documents": 182735, + "min_query_length": 2, + "average_query_length": 44.99753755232701, + "max_query_length": 2, + "unique_queries": 8122, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 8122 }, "ruby": { - "number_of_characters": 268.8731165741475, + "number_of_characters": 391703, "num_samples": 28849, "num_queries": 1261, "num_documents": 27588, - "average_document_length": 7.24952878062926e-05, - "average_query_length": 0.21163609561788066, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 36, + "average_document_length": 12.198310859794113, + "max_document_length": 2244, + "unique_documents": 27588, + "min_query_length": 2, + "average_query_length": 43.75574940523394, + "max_query_length": 2, + "unique_queries": 1261, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1261 }, "java": { - "number_of_characters": 344.5341853035144, + "number_of_characters": 4114584, "num_samples": 192016, "num_queries": 10955, "num_documents": 181061, - "average_document_length": 1.1046001071462105e-05, - "average_query_length": 0.03126738341428703, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 38, + "average_document_length": 20.724849636310413, + "max_document_length": 5066, + "unique_documents": 181061, + "min_query_length": 2, + "average_query_length": 33.055408489274306, + "max_query_length": 2, + "unique_queries": 10955, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + 
"unique_relevant_docs": 10955 }, "php": { - "number_of_characters": 338.62194947909234, + "number_of_characters": 5253894, "num_samples": 282251, "num_queries": 14014, "num_documents": 268237, - "average_document_length": 7.456092932742314e-06, - "average_query_length": 0.024020404558234076, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 40, + "average_document_length": 17.586760961388624, + "max_document_length": 2995, + "unique_documents": 268237, + "min_query_length": 2, + "average_query_length": 38.28129013843299, + "max_query_length": 2, + "unique_queries": 14014, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 14014 } } } diff --git a/mteb/descriptive_stats/Retrieval/CodeSearchNetRetrieval.json b/mteb/descriptive_stats/Retrieval/CodeSearchNetRetrieval.json index 188d4eb7e..853c4c79c 100644 --- a/mteb/descriptive_stats/Retrieval/CodeSearchNetRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/CodeSearchNetRetrieval.json @@ -1,66 +1,129 @@ { "test": { - "number_of_characters": 325.01233333333334, + "number_of_characters": 1950074, "num_samples": 12000, "num_queries": 6000, "num_documents": 6000, - "average_document_length": 0.00016666666666666666, - "average_query_length": 0.05400205555555556, + "min_document_length": 2, + "average_document_length": 324.01233333333334, + "max_document_length": 17533, + "unique_documents": 6000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 6000, + "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 6000, "hf_subset_descriptive_stats": { "python": { - "number_of_characters": 467.546, + "number_of_characters": 467546, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.466546, - 
"average_relevant_docs_per_query": 1.0 + "min_document_length": 8, + "average_document_length": 466.546, + "max_document_length": 8636, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "javascript": { - "number_of_characters": 187.018, + "number_of_characters": 187018, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.186018, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 2, + "average_document_length": 186.018, + "max_document_length": 7657, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "go": { - "number_of_characters": 126.213, + "number_of_characters": 126213, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.125213, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 14, + "average_document_length": 125.213, + "max_document_length": 1501, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "ruby": { - "number_of_characters": 314.818, + "number_of_characters": 314818, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.313818, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 5, + 
"average_document_length": 313.818, + "max_document_length": 17533, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "java": { - "number_of_characters": 691.36, + "number_of_characters": 691360, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.69036, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 2, + "average_document_length": 690.36, + "max_document_length": 6473, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 }, "php": { - "number_of_characters": 163.119, + "number_of_characters": 163119, "num_samples": 2000, "num_queries": 1000, "num_documents": 1000, - "average_document_length": 0.001, - "average_query_length": 0.162119, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 5, + "average_document_length": 162.119, + "max_document_length": 1240, + "unique_documents": 1000, + "min_query_length": 1, + "average_query_length": 1.0, + "max_query_length": 1, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 } } } diff --git a/mteb/descriptive_stats/Retrieval/CodeTransOceanContest.json b/mteb/descriptive_stats/Retrieval/CodeTransOceanContest.json index da6aa8174..07081e69c 100644 --- a/mteb/descriptive_stats/Retrieval/CodeTransOceanContest.json +++ b/mteb/descriptive_stats/Retrieval/CodeTransOceanContest.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 
2520.6537967751206, + "number_of_characters": 1744286, "num_samples": 1229, "num_queries": 221, "num_documents": 1008, - "average_document_length": 1.4965681295666415, - "average_query_length": 4.57969738539342, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 8, + "average_document_length": 221.90178571428572, + "max_document_length": 4147, + "unique_documents": 1008, + "min_query_length": 8, + "average_query_length": 6880.58371040724, + "max_query_length": 10852, + "unique_queries": 221, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 221 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CodeTransOceanDL.json b/mteb/descriptive_stats/Retrieval/CodeTransOceanDL.json index cf266671f..042658caa 100644 --- a/mteb/descriptive_stats/Retrieval/CodeTransOceanDL.json +++ b/mteb/descriptive_stats/Retrieval/CodeTransOceanDL.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 3347.695751633987, + "number_of_characters": 1543912, "num_samples": 996, "num_queries": 180, "num_documents": 816, - "average_document_length": 1.8138155997693195, - "average_query_length": 10.37567901234568, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 376, + "average_document_length": 411.97549019607845, + "max_document_length": 8285, + "unique_documents": 816, + "min_query_length": 58, + "average_query_length": 6709.666666666667, + "max_query_length": 8469, + "unique_queries": 180, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 180 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CosQA.json b/mteb/descriptive_stats/Retrieval/CosQA.json index 5dd3a9637..d8f17d4b2 100644 --- a/mteb/descriptive_stats/Retrieval/CosQA.json +++ b/mteb/descriptive_stats/Retrieval/CosQA.json @@ -1,11 +1,20 @@ { "test": { - 
"number_of_characters": 313.946741215298, + "number_of_characters": 5728450, "num_samples": 21104, "num_queries": 500, "num_documents": 20604, - "average_document_length": 0.013450433955314403, - "average_query_length": 0.073628, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 18, + "average_document_length": 0.8933702193748787, + "max_document_length": 83, + "unique_documents": 20604, + "min_query_length": 88, + "average_query_length": 11420.086, + "max_query_length": 6396, + "unique_queries": 500, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 500 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/JaqketRetrieval.json b/mteb/descriptive_stats/Retrieval/JaqketRetrieval.json index 6498bb736..4598b2af7 100644 --- a/mteb/descriptive_stats/Retrieval/JaqketRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/JaqketRetrieval.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 3799.701347237398, + "number_of_characters": 428294530, "num_samples": 115226, "num_queries": 997, "num_documents": 114229, - "average_document_length": 0.03281999517532617, - "average_query_length": 0.050858694438380335, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 16, + "average_document_length": 0.4425671239352529, + "max_document_length": 98, + "unique_documents": 114229, + "min_query_length": 8, + "average_query_length": 429532.5737211635, + "max_query_length": 188424, + "unique_queries": 997, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 989 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NFCorpus.json b/mteb/descriptive_stats/Retrieval/NFCorpus.json new file mode 100644 index 000000000..94df0b0cf --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NFCorpus.json @@ -0,0 +1,11 @@ +{ + "test": { + 
"number_of_characters": 1612.5486310130989, + "num_samples": 3956, + "num_queries": 323, + "num_documents": 3633, + "average_document_length": 0.43787060972495073, + "average_query_length": 0.06738299034784193, + "average_relevant_docs_per_query": 38.18575851393189 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/StackOverflowQA.json b/mteb/descriptive_stats/Retrieval/StackOverflowQA.json index d0949feac..51972461e 100644 --- a/mteb/descriptive_stats/Retrieval/StackOverflowQA.json +++ b/mteb/descriptive_stats/Retrieval/StackOverflowQA.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 2506.1079405241967, + "number_of_characters": 26584028, "num_samples": 21925, "num_queries": 1994, "num_documents": 19931, - "average_document_length": 0.060382397340162784, - "average_query_length": 0.6532730085944896, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 61, + "average_document_length": 130.32145903366614, + "max_document_length": 22234, + "unique_documents": 19931, + "min_query_length": 5, + "average_query_length": 12029.38365095286, + "max_query_length": 46028, + "unique_queries": 1994, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1994 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/SyntheticText2SQL.json b/mteb/descriptive_stats/Retrieval/SyntheticText2SQL.json index c833692b9..56c3964a5 100644 --- a/mteb/descriptive_stats/Retrieval/SyntheticText2SQL.json +++ b/mteb/descriptive_stats/Retrieval/SyntheticText2SQL.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 210.9770886090626, + "number_of_characters": 14041553, "num_samples": 111702, "num_queries": 5851, "num_documents": 105851, - "average_document_length": 0.0012099201759594499, - "average_query_length": 0.014169514281931103, - "average_relevant_docs_per_query": 1.0 + "min_document_length": 13, + "average_document_length": 
4.582686984534865, + "max_document_length": 281, + "unique_documents": 105851, + "min_query_length": 17, + "average_query_length": 2316.9494103572038, + "max_query_length": 762, + "unique_queries": 5851, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 5851 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/Touche2020.json b/mteb/descriptive_stats/Retrieval/Touche2020.json index 76798710b..a3c37a54e 100644 --- a/mteb/descriptive_stats/Retrieval/Touche2020.json +++ b/mteb/descriptive_stats/Retrieval/Touche2020.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 1763.7633372731125, + "number_of_characters": 658107591, "num_samples": 382594, "num_queries": 49, "num_documents": 382545, - "average_document_length": 0.00449707816294695, - "average_query_length": 0.8862973760932945, - "average_relevant_docs_per_query": 19.020408163265305 + "min_document_length": 16, + "average_document_length": 0.0055627442523101854, + "max_document_length": 83, + "unique_documents": 382545, + "min_query_length": 3, + "average_query_length": 13430723.734693877, + "max_query_length": 106072, + "unique_queries": 49, + "min_relevant_docs_per_query": 40, + "average_relevant_docs_per_query": 45.183673469387756, + "max_relevant_docs_per_query": 52, + "unique_relevant_docs": 2099 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/Touche2020Retrieval.v3.json b/mteb/descriptive_stats/Retrieval/Touche2020Retrieval.v3.json index 3d04c572c..1b436abd7 100644 --- a/mteb/descriptive_stats/Retrieval/Touche2020Retrieval.v3.json +++ b/mteb/descriptive_stats/Retrieval/Touche2020Retrieval.v3.json @@ -1,11 +1,20 @@ { "test": { - "number_of_characters": 2140.8203839475027, + "number_of_characters": 637047138, "num_samples": 303781, "num_queries": 49, "num_documents": 303732, - "average_document_length": 0.006905402830518125, - "average_query_length": 
0.8862973760932945, - "average_relevant_docs_per_query": 34.93877551020408 + "min_document_length": 16, + "average_document_length": 0.007006176497701922, + "max_document_length": 83, + "unique_documents": 303732, + "min_query_length": 41, + "average_query_length": 13000918.57142857, + "max_query_length": 105983, + "unique_queries": 49, + "min_relevant_docs_per_query": 40, + "average_relevant_docs_per_query": 58.142857142857146, + "max_relevant_docs_per_query": 87, + "unique_relevant_docs": 2732 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/mFollowIRCrossLingualInstructionRetrieval.json b/mteb/descriptive_stats/Retrieval/mFollowIRCrossLingualInstructionRetrieval.json index 6a48e9b08..f23a5ea1b 100644 --- a/mteb/descriptive_stats/Retrieval/mFollowIRCrossLingualInstructionRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/mFollowIRCrossLingualInstructionRetrieval.json @@ -4,48 +4,112 @@ "num_docs": 121635, "num_queries": 123, "number_of_characters": 283654099, + "min_document_length": 74, "average_document_length": 2331.0777818884367, + "max_document_length": 24179, + "unique_docs": 121635, + "min_query_length": 32, "average_query_length": 81.8780487804878, + "max_query_length": 173, + "unique_queries": 75, + "min_instruction_length": 93, "average_instruction_length": 389.9512195121951, + "max_instruction_length": 887, + "unique_instructions": 75, + "min_changed_instruction_length": 180, "average_changed_instruction_length": 450.5528455284553, + "max_changed_instruction_length": 974, + "unique_changed_instructions": 123, + "min_average_relevant_docs_per_query": 0, "average_relevant_docs_per_query": 10.43089430894309, + "max_average_relevant_docs_per_query": 24, + "min_average_top_ranked_per_query": 1000, "average_top_ranked_per_query": 1000.0, + "max_average_top_ranked_per_query": 1000, "hf_subset_descriptive_stats": { "eng-fas": { "num_samples": 41229, "num_docs": 41189, "num_queries": 40, "number_of_characters": 129597567, + 
"min_document_length": 99, "average_document_length": 3145.4990895627475, + "max_document_length": 24179, + "unique_docs": 41189, + "min_query_length": 34, "average_query_length": 80.075, + "max_query_length": 124, + "unique_queries": 40, + "min_instruction_length": 150, "average_instruction_length": 396.875, + "max_instruction_length": 887, + "unique_instructions": 40, + "min_changed_instruction_length": 205, "average_changed_instruction_length": 463.175, + "max_changed_instruction_length": 974, + "unique_changed_instructions": 40, + "min_average_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 10.85, - "average_top_ranked_per_query": 1000.0 + "max_average_relevant_docs_per_query": 22, + "min_average_top_ranked_per_query": 1000, + "average_top_ranked_per_query": 1000.0, + "max_average_top_ranked_per_query": 1000 }, "eng-rus": { "num_samples": 39366, "num_docs": 39326, "num_queries": 40, "number_of_characters": 109522175, + "min_document_length": 75, "average_document_length": 2784.0813456746173, + "max_document_length": 24061, + "unique_docs": 39326, + "min_query_length": 32, "average_query_length": 81.875, + "max_query_length": 173, + "unique_queries": 40, + "min_instruction_length": 93, "average_instruction_length": 371.125, + "max_instruction_length": 887, + "unique_instructions": 40, + "min_changed_instruction_length": 180, "average_changed_instruction_length": 431.8, + "max_changed_instruction_length": 957, + "unique_changed_instructions": 40, + "min_average_relevant_docs_per_query": 0, "average_relevant_docs_per_query": 9.775, - "average_top_ranked_per_query": 1000.0 + "max_average_relevant_docs_per_query": 24, + "min_average_top_ranked_per_query": 1000, + "average_top_ranked_per_query": 1000.0, + "max_average_top_ranked_per_query": 1000 }, "eng-zho": { "num_samples": 41163, "num_docs": 41120, "num_queries": 43, "number_of_characters": 44534357, + "min_document_length": 74, "average_document_length": 1082.0501215953307, + 
"max_document_length": 23840, + "unique_docs": 41120, + "min_query_length": 32, "average_query_length": 83.55813953488372, + "max_query_length": 159, + "unique_queries": 43, + "min_instruction_length": 157, "average_instruction_length": 401.0232558139535, + "max_instruction_length": 731, + "unique_instructions": 43, + "min_changed_instruction_length": 209, "average_changed_instruction_length": 456.25581395348837, + "max_changed_instruction_length": 822, + "unique_changed_instructions": 43, + "min_average_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 10.651162790697674, - "average_top_ranked_per_query": 1000.0 + "max_average_relevant_docs_per_query": 24, + "min_average_top_ranked_per_query": 1000, + "average_top_ranked_per_query": 1000.0, + "max_average_top_ranked_per_query": 1000 } } } diff --git a/mteb/descriptive_stats/Retrieval/mFollowIRInstructionRetrieval.json b/mteb/descriptive_stats/Retrieval/mFollowIRInstructionRetrieval.json index 893dfde70..54ae5d1ec 100644 --- a/mteb/descriptive_stats/Retrieval/mFollowIRInstructionRetrieval.json +++ b/mteb/descriptive_stats/Retrieval/mFollowIRInstructionRetrieval.json @@ -4,48 +4,112 @@ "num_docs": 121635, "num_queries": 123, "number_of_characters": 283622456, + "min_document_length": 74, "average_document_length": 2331.0777818884367, + "max_document_length": 24179, + "unique_docs": 121635, + "min_query_length": 10, "average_query_length": 57.113821138211385, + "max_query_length": 136, + "unique_queries": 123, + "min_instruction_length": 37, "average_instruction_length": 281.0650406504065, + "max_instruction_length": 1009, + "unique_instructions": 123, + "min_changed_instruction_length": 44, "average_changed_instruction_length": 326.9430894308943, + "max_changed_instruction_length": 1083, + "unique_changed_instructions": 123, + "min_average_relevant_docs_per_query": 0, "average_relevant_docs_per_query": 10.43089430894309, + "max_average_relevant_docs_per_query": 24, + "min_average_top_ranked_per_query": 
1000, "average_top_ranked_per_query": 1000.0, + "max_average_top_ranked_per_query": 1000, "hf_subset_descriptive_stats": { "fas": { "num_samples": 41229, "num_docs": 41189, "num_queries": 40, "number_of_characters": 129593838, + "min_document_length": 99, "average_document_length": 3145.4990895627475, + "max_document_length": 24179, + "unique_docs": 41189, + "min_query_length": 34, "average_query_length": 72.65, + "max_query_length": 124, + "unique_queries": 40, + "min_instruction_length": 121, "average_instruction_length": 358.925, + "max_instruction_length": 759, + "unique_instructions": 40, + "min_changed_instruction_length": 163, "average_changed_instruction_length": 415.325, + "max_changed_instruction_length": 842, + "unique_changed_instructions": 40, + "min_average_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 10.85, - "average_top_ranked_per_query": 1000.0 + "max_average_relevant_docs_per_query": 22, + "min_average_top_ranked_per_query": 1000, + "average_top_ranked_per_query": 1000.0, + "max_average_top_ranked_per_query": 1000 }, "rus": { "num_samples": 39366, "num_docs": 39326, "num_queries": 40, "number_of_characters": 109523683, + "min_document_length": 75, "average_document_length": 2784.0813456746173, + "max_document_length": 24061, + "unique_docs": 39326, + "min_query_length": 26, "average_query_length": 77.5, + "max_query_length": 136, + "unique_queries": 40, + "min_instruction_length": 78, "average_instruction_length": 387.0, + "max_instruction_length": 1009, + "unique_instructions": 40, + "min_changed_instruction_length": 187, "average_changed_instruction_length": 458.0, + "max_changed_instruction_length": 1083, + "unique_changed_instructions": 40, + "min_average_relevant_docs_per_query": 0, "average_relevant_docs_per_query": 9.775, - "average_top_ranked_per_query": 1000.0 + "max_average_relevant_docs_per_query": 24, + "min_average_top_ranked_per_query": 1000, + "average_top_ranked_per_query": 1000.0, + 
"max_average_top_ranked_per_query": 1000 }, "zho": { "num_samples": 41163, "num_docs": 41120, "num_queries": 43, "number_of_characters": 44504935, + "min_document_length": 74, "average_document_length": 1082.0501215953307, + "max_document_length": 23840, + "unique_docs": 41120, + "min_query_length": 10, "average_query_length": 23.697674418604652, + "max_query_length": 44, + "unique_queries": 43, + "min_instruction_length": 37, "average_instruction_length": 110.09302325581395, + "max_instruction_length": 209, + "unique_instructions": 43, + "min_changed_instruction_length": 44, "average_changed_instruction_length": 122.81395348837209, + "max_changed_instruction_length": 229, + "unique_changed_instructions": 43, + "min_average_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 10.651162790697674, - "average_top_ranked_per_query": 1000.0 + "max_average_relevant_docs_per_query": 24, + "min_average_top_ranked_per_query": 1000, + "average_top_ranked_per_query": 1000.0, + "max_average_top_ranked_per_query": 1000 } } } diff --git a/mteb/descriptive_stats/STS/STS12.json b/mteb/descriptive_stats/STS/STS12.json index e9205c172..a7e11197a 100644 --- a/mteb/descriptive_stats/STS/STS12.json +++ b/mteb/descriptive_stats/STS/STS12.json @@ -2,8 +2,16 @@ "test": { "num_samples": 3108, "number_of_characters": 402118, + "min_sentence1_length": 3, "average_sentence1_len": 63.78893178893179, + "max_sentence1_length": 220, + "unique_sentence1": 2236, + "min_sentence2_length": 7, "average_sentence2_len": 65.5926640926641, - "avg_score": 3.5060643500643507 + "max_sentence2_length": 204, + "unique_sentence2": 2797, + "min_score": 0.0, + "avg_score": 3.5060643500643507, + "max_score": 5.0 } } \ No newline at end of file diff --git a/mteb/descriptive_stats/STS/STS17.json b/mteb/descriptive_stats/STS/STS17.json index 164cc9d1e..912738035 100644 --- a/mteb/descriptive_stats/STS/STS17.json +++ b/mteb/descriptive_stats/STS/STS17.json @@ -2,86 +2,182 @@ "test": { "num_samples": 5346, 
"number_of_characters": 400264, + "min_sentence1_length": 6, "average_sentence1_len": 38.14665170220726, + "max_sentence1_length": 976, + "unique_sentence1": 4900, + "min_sentence2_length": 6, "average_sentence2_len": 36.72502805836139, + "max_sentence2_length": 1007, + "unique_sentence2": 4470, + "min_score": 0.0, "avg_score": 2.3554804214989464, + "max_score": 5.0, "hf_subset_descriptive_stats": { "ko-ko": { "num_samples": 2846, "number_of_characters": 183387, + "min_sentence1_length": 6, "average_sentence1_len": 31.991918482080113, + "max_sentence1_length": 976, + "unique_sentence1": 2650, + "min_sentence2_length": 6, "average_sentence2_len": 32.44483485593816, - "avg_score": 2.469359920356055 + "max_sentence2_length": 1007, + "unique_sentence2": 2720, + "min_score": 0.0, + "avg_score": 2.469359920356055, + "max_score": 5.0 }, "ar-ar": { "num_samples": 250, "number_of_characters": 16247, + "min_sentence1_length": 11, "average_sentence1_len": 32.208, + "max_sentence1_length": 99, + "unique_sentence1": 250, + "min_sentence2_length": 9, "average_sentence2_len": 32.78, - "avg_score": 2.216800000000001 + "max_sentence2_length": 83, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.216800000000001, + "max_score": 5.0 }, "en-ar": { "num_samples": 250, "number_of_characters": 18764, + "min_sentence1_length": 13, "average_sentence1_len": 42.36, + "max_sentence1_length": 105, + "unique_sentence1": 250, + "min_sentence2_length": 10, "average_sentence2_len": 32.696, - "avg_score": 2.1423999999999994 + "max_sentence2_length": 104, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.1423999999999994, + "max_score": 5.0 }, "en-de": { "num_samples": 250, "number_of_characters": 22177, + "min_sentence1_length": 12, "average_sentence1_len": 43.952, + "max_sentence1_length": 94, + "unique_sentence1": 250, + "min_sentence2_length": 15, "average_sentence2_len": 44.756, - "avg_score": 2.2776000000000014 + "max_sentence2_length": 104, + "unique_sentence2": 
250, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 }, "en-en": { "num_samples": 250, "number_of_characters": 21669, + "min_sentence1_length": 12, "average_sentence1_len": 43.952, + "max_sentence1_length": 94, + "unique_sentence1": 250, + "min_sentence2_length": 15, "average_sentence2_len": 42.724, - "avg_score": 2.2776000000000014 + "max_sentence2_length": 101, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 }, "en-tr": { "num_samples": 250, "number_of_characters": 20879, + "min_sentence1_length": 15, "average_sentence1_len": 41.916, + "max_sentence1_length": 101, + "unique_sentence1": 250, + "min_sentence2_length": 10, "average_sentence2_len": 41.6, - "avg_score": 2.1335999999999986 + "max_sentence2_length": 107, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.1335999999999986, + "max_score": 5.0 }, "es-en": { "num_samples": 250, "number_of_characters": 23216, + "min_sentence1_length": 12, "average_sentence1_len": 50.84, + "max_sentence1_length": 160, + "unique_sentence1": 250, + "min_sentence2_length": 14, "average_sentence2_len": 42.024, - "avg_score": 2.1464000000000003 + "max_sentence2_length": 117, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.1464000000000003, + "max_score": 5.0 }, "es-es": { "num_samples": 250, "number_of_characters": 25265, + "min_sentence1_length": 18, "average_sentence1_len": 49.836, + "max_sentence1_length": 136, + "unique_sentence1": 250, + "min_sentence2_length": 13, "average_sentence2_len": 51.224, - "avg_score": 2.2312000000000007 + "max_sentence2_length": 129, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.2312000000000007, + "max_score": 5.0 }, "fr-en": { "num_samples": 250, "number_of_characters": 23087, + "min_sentence1_length": 19, "average_sentence1_len": 49.624, + "max_sentence1_length": 115, + "unique_sentence1": 250, + "min_sentence2_length": 15, "average_sentence2_len": 42.724, - "avg_score": 
2.2776000000000014 + "max_sentence2_length": 101, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 }, "it-en": { "num_samples": 250, "number_of_characters": 23188, + "min_sentence1_length": 15, "average_sentence1_len": 50.028, + "max_sentence1_length": 113, + "unique_sentence1": 250, + "min_sentence2_length": 15, "average_sentence2_len": 42.724, - "avg_score": 2.2776000000000014 + "max_sentence2_length": 101, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 }, "nl-en": { "num_samples": 250, "number_of_characters": 22385, + "min_sentence1_length": 14, "average_sentence1_len": 46.816, + "max_sentence1_length": 123, + "unique_sentence1": 250, + "min_sentence2_length": 15, "average_sentence2_len": 42.724, - "avg_score": 2.2776000000000014 + "max_sentence2_length": 101, + "unique_sentence2": 250, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 } } } diff --git a/mteb/descriptive_stats/Summarization/SummEval.json b/mteb/descriptive_stats/Summarization/SummEval.json index e9c0b172e..4c2f133ab 100644 --- a/mteb/descriptive_stats/Summarization/SummEval.json +++ b/mteb/descriptive_stats/Summarization/SummEval.json @@ -2,9 +2,54 @@ "test": { "num_samples": 100, "number_of_characters": 212735, - "avg_text_len": 2100.35, - "avg_human_summaries_len": 11.0, - "avg_machine_summaries_len": 16.0, - "avg_relevance": 3.7770833333333336 + "min_text_length": 626, + "avg_text_length": 2100.35, + "max_text_length": 3153, + "unique_texts": 100, + "min_human_summaries_length": 11, + "avg_human_summaries_length": 11.0, + "max_human_summaries_length": 11, + "unique_human_summaries": 1100, + "min_machine_summaries_length": 16, + "avg_machine_summaries_length": 16.0, + "max_machine_summaries_length": 16, + "unique_machine_summaries": 1548, + "min_relevance": [ + 1.0, + 1.3333333333333333, + 3.6666666666666665, + 2.3333333333333335, + 3.6666666666666665, + 3.0, + 
4.333333333333333, + 4.0, + 2.6666666666666665, + 4.0, + 2.0, + 4.666666666666667, + 4.333333333333333, + 1.0, + 2.0, + 1.0 + ], + "avg_relevance": 3.7770833333333336, + "max_relevance": [ + 5.0, + 4.666666666666667, + 4.333333333333333, + 2.6666666666666665, + 4.666666666666667, + 4.666666666666667, + 4.666666666666667, + 4.333333333333333, + 4.0, + 4.333333333333333, + 4.666666666666667, + 4.666666666666667, + 4.333333333333333, + 2.3333333333333335, + 4.666666666666667, + 4.666666666666667 + ] } } \ No newline at end of file diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 2ac0096da..489b67ab4 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -52,10 +52,25 @@ class MockClassificationTask(AbsTaskClassification): "test": { "num_samples": 2, "number_of_characters": 52, + "num_texts_in_train": 1, + "min_text_length": 23, "average_text_length": 26.0, + "max_text_length": 29, + "unique_text": 2, "unique_labels": 2, "labels": {"0": {"count": 1}, "1": {"count": 1}}, - } + }, + "train": { + "num_samples": 2, + "number_of_characters": 53, + "num_texts_in_train": None, + "min_text_length": 23, + "average_text_length": 26.5, + "max_text_length": 30, + "unique_text": 2, + "unique_labels": 2, + "labels": {"0": {"count": 1}, "1": {"count": 1}}, + }, } metadata = TaskMetadata( @@ -66,20 +81,22 @@ class MockClassificationTask(AbsTaskClassification): ) def load_data(self, **kwargs): - texts = ["This is a test sentence", "This is another test sentence"] + train_texts = ["This is a test sentence", "This is another train sentence"] + test_texts = ["This is a test sentence", "This is another test sentence"] + labels = [0, 1] self.dataset = DatasetDict( { "test": Dataset.from_dict( { - "text": texts, + "text": test_texts, "label": labels, } ), "train": Dataset.from_dict( { - "text": texts, + "text": train_texts, "label": labels, } ), @@ -93,26 +110,73 @@ class 
MockMultilingualClassificationTask(AbsTaskClassification, MultilingualTask "test": { "num_samples": 4, "number_of_characters": 104, + "num_texts_in_train": 1, + "min_text_length": 23, "average_text_length": 26.0, + "max_text_length": 29, + "unique_text": 2, "unique_labels": 2, "labels": {"0": {"count": 2}, "1": {"count": 2}}, "hf_subset_descriptive_stats": { "eng": { "num_samples": 2, "number_of_characters": 52, + "num_texts_in_train": 1, + "min_text_length": 23, "average_text_length": 26.0, + "max_text_length": 29, + "unique_text": 2, "unique_labels": 2, "labels": {"0": {"count": 1}, "1": {"count": 1}}, }, "fra": { "num_samples": 2, "number_of_characters": 52, + "num_texts_in_train": 1, + "min_text_length": 23, "average_text_length": 26.0, + "max_text_length": 29, + "unique_text": 2, "unique_labels": 2, "labels": {"0": {"count": 1}, "1": {"count": 1}}, }, }, - } + }, + "train": { + "num_samples": 4, + "number_of_characters": 106, + "num_texts_in_train": None, + "min_text_length": 23, + "average_text_length": 26.5, + "max_text_length": 30, + "unique_text": 2, + "unique_labels": 2, + "labels": {"0": {"count": 2}, "1": {"count": 2}}, + "hf_subset_descriptive_stats": { + "eng": { + "num_samples": 2, + "number_of_characters": 53, + "num_texts_in_train": None, + "min_text_length": 23, + "average_text_length": 26.5, + "max_text_length": 30, + "unique_text": 2, + "unique_labels": 2, + "labels": {"0": {"count": 1}, "1": {"count": 1}}, + }, + "fra": { + "num_samples": 2, + "number_of_characters": 53, + "num_texts_in_train": None, + "min_text_length": 23, + "average_text_length": 26.5, + "max_text_length": 30, + "unique_text": 2, + "unique_labels": 2, + "labels": {"0": {"count": 1}, "1": {"count": 1}}, + }, + }, + }, } metadata = TaskMetadata( @@ -124,18 +188,19 @@ class MockMultilingualClassificationTask(AbsTaskClassification, MultilingualTask metadata.eval_langs = multilingual_eval_langs def load_data(self, **kwargs): - texts = ["This is a test sentence", "This is another 
test sentence"] + train_texts = ["This is a test sentence", "This is another train sentence"] + test_texts = ["This is a test sentence", "This is another test sentence"] labels = [0, 1] data = { "test": Dataset.from_dict( { - "text": texts, + "text": test_texts, "label": labels, } ), "train": Dataset.from_dict( { - "text": texts, + "text": train_texts, "label": labels, } ), @@ -153,10 +218,17 @@ def load_data(self, **kwargs): class MockBitextMiningTask(AbsTaskBitextMining): expected_stats = { "test": { - "average_sentence1_length": 26.0, - "average_sentence2_length": 30.5, "num_samples": 2, "number_of_characters": 113, + "unique_pairs": 2, + "min_sentence1_length": 23, + "average_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, + "average_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, } } @@ -190,22 +262,43 @@ def load_data(self, **kwargs): class MockMultilingualBitextMiningTask(AbsTaskBitextMining, MultilingualTask): expected_stats = { "test": { - "average_sentence1_length": 26.0, - "average_sentence2_length": 30.5, "num_samples": 4, "number_of_characters": 226, + "unique_pairs": 2, + "min_sentence1_length": 23, + "average_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, + "average_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, "hf_subset_descriptive_stats": { "eng": { - "average_sentence1_length": 26.0, - "average_sentence2_length": 30.5, "num_samples": 2, "number_of_characters": 113, - }, - "fra": { + "unique_pairs": 2, + "min_sentence1_length": 23, "average_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, "average_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, + }, + "fra": { "num_samples": 2, "number_of_characters": 113, + "unique_pairs": 2, + "min_sentence1_length": 23, + 
"average_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, + "average_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, }, }, } @@ -246,22 +339,43 @@ class MockMultilingualParallelBitextMiningTask(AbsTaskBitextMining, Multilingual parallel_subsets = True expected_stats = { "test": { - "average_sentence1_length": 28.25, - "average_sentence2_length": 28.25, "num_samples": 4, "number_of_characters": 226, + "unique_pairs": 4, + "min_sentence1_length": 23, + "average_sentence1_length": 28.25, + "max_sentence1_length": 37, + "unique_sentence1": 4, + "min_sentence2_length": 23, + "average_sentence2_length": 28.25, + "max_sentence2_length": 37, + "unique_sentence2": 4, "hf_subset_descriptive_stats": { "eng_Latn-fra_Latn": { - "average_sentence1_length": 26.0, - "average_sentence2_length": 30.5, "num_samples": 2, "number_of_characters": 113, + "unique_pairs": 2, + "min_sentence1_length": 23, + "average_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, + "average_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, }, "fra_Latn-eng_Latn": { - "average_sentence1_length": 30.5, - "average_sentence2_length": 26.0, "num_samples": 2, "number_of_characters": 113, + "unique_pairs": 2, + "min_sentence1_length": 24, + "average_sentence1_length": 30.5, + "max_sentence1_length": 37, + "unique_sentence1": 2, + "min_sentence2_length": 23, + "average_sentence2_length": 26.0, + "max_sentence2_length": 29, + "unique_sentence2": 2, }, }, } @@ -303,8 +417,13 @@ class MockClusteringTask(AbsTaskClustering): "test": { "num_samples": 1, "number_of_characters": 3, + "min_text_length": 3, "average_text_length": 3.0, + "max_text_length": 3, + "unique_texts": 3, + "min_labels_per_text": 1, "average_labels_per_text": 3.0, + "max_labels_per_text": 1, "unique_labels": 3, "labels": {"0": {"count": 1}, "1": {"count": 1}, "2": 
{"count": 1}}, } @@ -345,24 +464,39 @@ class MockMultilingualClusteringTask(AbsTaskClustering, MultilingualTask): "test": { "num_samples": 2, "number_of_characters": 6, + "min_text_length": 3, "average_text_length": 3.0, + "max_text_length": 3, + "unique_texts": 3, + "min_labels_per_text": 2, "average_labels_per_text": 3.0, + "max_labels_per_text": 2, "unique_labels": 3, "labels": {"0": {"count": 2}, "1": {"count": 2}, "2": {"count": 2}}, "hf_subset_descriptive_stats": { "eng": { "num_samples": 1, "number_of_characters": 3, + "min_text_length": 3, "average_text_length": 3.0, + "max_text_length": 3, + "unique_texts": 3, + "min_labels_per_text": 1, "average_labels_per_text": 3.0, + "max_labels_per_text": 1, "unique_labels": 3, "labels": {"0": {"count": 1}, "1": {"count": 1}, "2": {"count": 1}}, }, "fra": { "num_samples": 1, "number_of_characters": 3, + "min_text_length": 3, "average_text_length": 3.0, + "max_text_length": 3, + "unique_texts": 3, + "min_labels_per_text": 1, "average_labels_per_text": 3.0, + "max_labels_per_text": 1, "unique_labels": 3, "labels": {"0": {"count": 1}, "1": {"count": 1}, "2": {"count": 1}}, }, @@ -412,8 +546,12 @@ class MockClusteringFastTask(AbsTaskClusteringFast): "test": { "num_samples": 3, "number_of_characters": 81, + "min_text_length": 23, "average_text_length": 27.0, + "max_text_length": 29, + "min_labels_per_text": 1, "average_labels_per_text": 1.0, + "max_labels_per_text": 1, "unique_labels": 3, "labels": {"0": {"count": 1}, "1": {"count": 1}, "2": {"count": 1}}, } @@ -454,24 +592,36 @@ class MockMultilingualClusteringFastTask(AbsTaskClusteringFast, MultilingualTask "test": { "num_samples": 6, "number_of_characters": 162, + "min_text_length": 23, "average_text_length": 27.0, + "max_text_length": 29, + "min_labels_per_text": 2, "average_labels_per_text": 1.0, + "max_labels_per_text": 2, "unique_labels": 3, "labels": {"0": {"count": 2}, "1": {"count": 2}, "2": {"count": 2}}, "hf_subset_descriptive_stats": { "eng": { "num_samples": 
3, "number_of_characters": 81, + "min_text_length": 23, "average_text_length": 27.0, + "max_text_length": 29, + "min_labels_per_text": 1, "average_labels_per_text": 1.0, + "max_labels_per_text": 1, "unique_labels": 3, "labels": {"0": {"count": 1}, "1": {"count": 1}, "2": {"count": 1}}, }, "fra": { "num_samples": 3, "number_of_characters": 81, + "min_text_length": 23, "average_text_length": 27.0, + "max_text_length": 29, + "min_labels_per_text": 1, "average_labels_per_text": 1.0, + "max_labels_per_text": 1, "unique_labels": 3, "labels": {"0": {"count": 1}, "1": {"count": 1}, "2": {"count": 1}}, }, @@ -517,8 +667,14 @@ class MockPairClassificationTask(AbsTaskPairClassification): "test": { "num_samples": 2, "number_of_characters": 113, - "avg_sentence1_len": 26.0, - "avg_sentence2_len": 30.5, + "min_sentence1_length": 23, + "avg_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, + "avg_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, "unique_labels": 2, "labels": {"1": {"count": 1}, "0": {"count": 1}}, } @@ -562,24 +718,42 @@ class MockMultilingualPairClassificationTask( "test": { "num_samples": 4, "number_of_characters": 226, - "avg_sentence1_len": 26.0, - "avg_sentence2_len": 30.5, + "min_sentence1_length": 23, + "avg_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, + "avg_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, "unique_labels": 2, "labels": {"1": {"count": 2}, "0": {"count": 2}}, "hf_subset_descriptive_stats": { "eng": { "num_samples": 2, "number_of_characters": 113, - "avg_sentence1_len": 26.0, - "avg_sentence2_len": 30.5, + "min_sentence1_length": 23, + "avg_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, + "avg_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, "unique_labels": 2, 
"labels": {"1": {"count": 1}, "0": {"count": 1}}, }, "fra": { "num_samples": 2, "number_of_characters": 113, - "avg_sentence1_len": 26.0, - "avg_sentence2_len": 30.5, + "min_sentence1_length": 23, + "avg_sentence1_length": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, + "avg_sentence2_length": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, "unique_labels": 2, "labels": {"1": {"count": 1}, "0": {"count": 1}}, }, @@ -627,9 +801,17 @@ class MockSTSTask(AbsTaskSTS): "test": { "num_samples": 2, "number_of_characters": 113, + "min_sentence1_length": 23, "average_sentence1_len": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, "average_sentence2_len": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, + "min_score": 0, "avg_score": 0.5, + "max_score": 1, } } @@ -674,23 +856,47 @@ class MockMultilingualSTSTask(AbsTaskSTS, MultilingualTask): "test": { "num_samples": 4, "number_of_characters": 226, + "min_sentence1_length": 23, "average_sentence1_len": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, "average_sentence2_len": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, + "min_score": 0, "avg_score": 0.5, + "max_score": 1, "hf_subset_descriptive_stats": { "eng": { "num_samples": 2, "number_of_characters": 113, + "min_sentence1_length": 23, "average_sentence1_len": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, "average_sentence2_len": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, + "min_score": 0, "avg_score": 0.5, + "max_score": 1, }, "fra": { "num_samples": 2, "number_of_characters": 113, + "min_sentence1_length": 23, "average_sentence1_len": 26.0, + "max_sentence1_length": 29, + "unique_sentence1": 2, + "min_sentence2_length": 24, "average_sentence2_len": 30.5, + "max_sentence2_length": 37, + "unique_sentence2": 2, + "min_score": 0, "avg_score": 
0.5, + "max_score": 1, }, }, } @@ -742,10 +948,21 @@ class MockSummarizationTask(AbsTaskSummarization): "test": { "num_samples": 2, "number_of_characters": 60, - "avg_text_len": 26.0, - "avg_human_summaries_len": 2.0, - "avg_machine_summaries_len": 2.0, + "min_text_length": 23, + "avg_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + "min_human_summaries_length": 2, + "avg_human_summaries_length": 2.0, + "max_human_summaries_length": 2, + "unique_human_summaries": 2, + "min_machine_summaries_length": 2, + "avg_machine_summaries_length": 2.0, + "max_machine_summaries_length": 2, + "unique_machine_summaries": 2, + "min_relevance": [0, 1], "avg_relevance": 0.5, + "max_relevance": [1, 0], } } @@ -795,26 +1012,59 @@ class MockMultilingualSummarizationTask(AbsTaskSummarization, MultilingualTask): "test": { "num_samples": 4, "number_of_characters": 120, - "avg_text_len": 26.0, - "avg_human_summaries_len": 2.0, - "avg_machine_summaries_len": 2.0, + "min_text_length": 23, + "avg_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + "min_human_summaries_length": 2, + "avg_human_summaries_length": 2.0, + "max_human_summaries_length": 2, + "unique_human_summaries": 2, + "min_machine_summaries_length": 2, + "avg_machine_summaries_length": 2.0, + "max_machine_summaries_length": 2, + "unique_machine_summaries": 2, + "min_relevance": [0, 1], "avg_relevance": 0.5, + "max_relevance": [1, 0], "hf_subset_descriptive_stats": { "eng": { "num_samples": 2, "number_of_characters": 60, - "avg_text_len": 26.0, - "avg_human_summaries_len": 2.0, - "avg_machine_summaries_len": 2.0, + "min_text_length": 23, + "avg_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + "min_human_summaries_length": 2, + "avg_human_summaries_length": 2.0, + "max_human_summaries_length": 2, + "unique_human_summaries": 2, + "min_machine_summaries_length": 2, + "avg_machine_summaries_length": 2.0, + "max_machine_summaries_length": 2, + "unique_machine_summaries": 2, + 
"min_relevance": [0, 1], "avg_relevance": 0.5, + "max_relevance": [1, 0], }, "fra": { "num_samples": 2, "number_of_characters": 60, - "avg_text_len": 26.0, - "avg_human_summaries_len": 2.0, - "avg_machine_summaries_len": 2.0, + "min_text_length": 23, + "avg_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + "min_human_summaries_length": 2, + "avg_human_summaries_length": 2.0, + "max_human_summaries_length": 2, + "unique_human_summaries": 2, + "min_machine_summaries_length": 2, + "avg_machine_summaries_length": 2.0, + "max_machine_summaries_length": 2, + "unique_machine_summaries": 2, + "min_relevance": [0, 1], "avg_relevance": 0.5, + "max_relevance": [1, 0], }, }, } @@ -872,9 +1122,18 @@ class MockRerankingTask(AbsTaskReranking): "number_of_characters": 172, "num_positive": 2, "num_negative": 2, - "avg_query_len": 26.0, - "avg_positive_len": 30.0, - "avg_negative_len": 30.0, + "min_query_length": 23, + "avg_query_length": 26.0, + "max_query_length": 29, + "unique_query": 2, + "min_positive_length": 27, + "avg_positive_length": 30.0, + "max_positive_length": 33, + "unique_positive": 2, + "min_negative_length": 27, + "avg_negative_length": 30.0, + "max_negative_length": 33, + "unique_negative": 2, } } @@ -917,27 +1176,54 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): "number_of_characters": 344, "num_positive": 4, "num_negative": 4, - "avg_query_len": 26.0, - "avg_positive_len": 30.0, - "avg_negative_len": 30.0, + "min_query_length": 23, + "avg_query_length": 26.0, + "max_query_length": 29, + "unique_query": 2, + "min_positive_length": 27, + "avg_positive_length": 30.0, + "max_positive_length": 33, + "unique_positive": 2, + "min_negative_length": 27, + "avg_negative_length": 30.0, + "max_negative_length": 33, + "unique_negative": 2, "hf_subset_descriptive_stats": { "eng": { "num_samples": 2, "number_of_characters": 172, "num_positive": 2, "num_negative": 2, - "avg_query_len": 26.0, - "avg_positive_len": 30.0, - 
"avg_negative_len": 30.0, + "min_query_length": 23, + "avg_query_length": 26.0, + "max_query_length": 29, + "unique_query": 2, + "min_positive_length": 27, + "avg_positive_length": 30.0, + "max_positive_length": 33, + "unique_positive": 2, + "min_negative_length": 27, + "avg_negative_length": 30.0, + "max_negative_length": 33, + "unique_negative": 2, }, "fra": { "num_samples": 2, "number_of_characters": 172, "num_positive": 2, "num_negative": 2, - "avg_query_len": 26.0, - "avg_positive_len": 30.0, - "avg_negative_len": 30.0, + "min_query_length": 23, + "avg_query_length": 26.0, + "max_query_length": 29, + "unique_query": 2, + "min_positive_length": 27, + "avg_positive_length": 30.0, + "max_positive_length": 33, + "unique_positive": 2, + "min_negative_length": 27, + "avg_negative_length": 30.0, + "max_negative_length": 33, + "unique_negative": 2, }, }, } @@ -982,13 +1268,22 @@ def load_data(self, **kwargs): class MockRetrievalTask(AbsTaskRetrieval): expected_stats = { "test": { + "number_of_characters": 112, "num_samples": 4, - "number_of_characters": 56.0, - "average_document_length": 15.0, - "average_query_length": 13.0, - "num_documents": 2, "num_queries": 2, - "average_relevant_docs_per_query": 1.0, + "num_documents": 2, + "min_document_length": 23, + "average_document_length": 26.0, + "max_document_length": 29, + "unique_documents": 2, + "min_query_length": 27, + "average_query_length": 30.0, + "max_query_length": 33, + "unique_queries": 2, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 2, } } @@ -1025,31 +1320,58 @@ def load_data(self, **kwargs): class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): expected_stats = { "test": { - "number_of_characters": 56.0, + "number_of_characters": 224, "num_samples": 8, "num_queries": 4, "num_documents": 4, - "average_document_length": 7.5, - "average_query_length": 6.5, - "average_relevant_docs_per_query": 1.0, 
+ "min_document_length": 23, + "average_document_length": 26.0, + "max_document_length": 29, + "unique_documents": 4, + "min_query_length": 27, + "average_query_length": 30.0, + "max_query_length": 33, + "unique_queries": 4, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 4, "hf_subset_descriptive_stats": { "eng": { - "number_of_characters": 56.0, + "number_of_characters": 112, "num_samples": 4, "num_queries": 2, "num_documents": 2, - "average_document_length": 15.0, - "average_query_length": 13.0, - "average_relevant_docs_per_query": 1.0, + "min_document_length": 23, + "average_document_length": 26.0, + "max_document_length": 29, + "unique_documents": 2, + "min_query_length": 27, + "average_query_length": 30.0, + "max_query_length": 33, + "unique_queries": 2, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 2, }, "fra": { - "number_of_characters": 56.0, + "number_of_characters": 112, "num_samples": 4, "num_queries": 2, "num_documents": 2, - "average_document_length": 15.0, - "average_query_length": 13.0, - "average_relevant_docs_per_query": 1.0, + "min_document_length": 23, + "average_document_length": 26.0, + "max_document_length": 29, + "unique_documents": 2, + "min_query_length": 27, + "average_query_length": 30.0, + "max_query_length": 33, + "unique_queries": 2, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 2, }, }, } @@ -1095,13 +1417,33 @@ def load_data(self, **kwargs): class MockMultilabelClassification(AbsTaskMultilabelClassification): expected_stats = { "test": { - "average_text_length": 26.0, + "num_samples": 6, "number_of_characters": 156, + "number_texts_in_train": 1, + "min_text_length": 23, + "average_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + 
"min_labels_per_text": 2, "average_label_per_text": 2.0, + "max_labels_per_text": 2, + "unique_labels": 2, + "labels": {"0": {"count": 6}, "1": {"count": 6}}, + }, + "train": { "num_samples": 6, + "number_of_characters": 159, + "number_texts_in_train": None, + "min_text_length": 23, + "average_text_length": 26.5, + "max_text_length": 30, + "unique_texts": 2, + "min_labels_per_text": 2, + "average_label_per_text": 2.0, + "max_labels_per_text": 2, "unique_labels": 2, "labels": {"0": {"count": 6}, "1": {"count": 6}}, - } + }, } metadata = TaskMetadata( @@ -1112,20 +1454,21 @@ class MockMultilabelClassification(AbsTaskMultilabelClassification): ) def load_data(self, **kwargs): - texts = ["This is a test sentence", "This is another test sentence"] * 3 + train_texts = ["This is a test sentence", "This is another train sentence"] * 3 + test_texts = ["This is a test sentence", "This is another test sentence"] * 3 labels = [[0, 1], [1, 0]] * 3 self.dataset = DatasetDict( { "test": Dataset.from_dict( { - "text": texts, + "text": test_texts, "label": labels, } ), "train": Dataset.from_dict( { - "text": texts, + "text": train_texts, "label": labels, } ), @@ -1139,31 +1482,93 @@ class MockMultilingualMultilabelClassification( ): expected_stats = { "test": { - "average_text_length": 26.0, + "num_samples": 12, "number_of_characters": 312, + "number_texts_in_train": 1, + "min_text_length": 23, + "average_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + "min_labels_per_text": 2, "average_label_per_text": 2.0, - "num_samples": 12, + "max_labels_per_text": 2, "unique_labels": 2, "labels": {"0": {"count": 12}, "1": {"count": 12}}, "hf_subset_descriptive_stats": { "eng": { - "average_text_length": 26.0, + "num_samples": 6, "number_of_characters": 156, + "number_texts_in_train": 1, + "min_text_length": 23, + "average_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + "min_labels_per_text": 2, "average_label_per_text": 2.0, - "num_samples": 6, + 
"max_labels_per_text": 2, "unique_labels": 2, "labels": {"0": {"count": 6}, "1": {"count": 6}}, }, "fra": { - "average_text_length": 26.0, + "num_samples": 6, "number_of_characters": 156, + "number_texts_in_train": 1, + "min_text_length": 23, + "average_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + "min_labels_per_text": 2, + "average_label_per_text": 2.0, + "max_labels_per_text": 2, + "unique_labels": 2, + "labels": {"0": {"count": 6}, "1": {"count": 6}}, + }, + }, + }, + "train": { + "num_samples": 12, + "number_of_characters": 318, + "number_texts_in_train": None, + "min_text_length": 23, + "average_text_length": 26.5, + "max_text_length": 30, + "unique_texts": 2, + "min_labels_per_text": 2, + "average_label_per_text": 2.0, + "max_labels_per_text": 2, + "unique_labels": 2, + "labels": {"0": {"count": 12}, "1": {"count": 12}}, + "hf_subset_descriptive_stats": { + "eng": { + "num_samples": 6, + "number_of_characters": 159, + "number_texts_in_train": None, + "min_text_length": 23, + "average_text_length": 26.5, + "max_text_length": 30, + "unique_texts": 2, + "min_labels_per_text": 2, "average_label_per_text": 2.0, + "max_labels_per_text": 2, + "unique_labels": 2, + "labels": {"0": {"count": 6}, "1": {"count": 6}}, + }, + "fra": { "num_samples": 6, + "number_of_characters": 159, + "number_texts_in_train": None, + "min_text_length": 23, + "average_text_length": 26.5, + "max_text_length": 30, + "unique_texts": 2, + "min_labels_per_text": 2, + "average_label_per_text": 2.0, + "max_labels_per_text": 2, "unique_labels": 2, "labels": {"0": {"count": 6}, "1": {"count": 6}}, }, }, - } + }, } metadata = TaskMetadata( @@ -1175,19 +1580,20 @@ class MockMultilingualMultilabelClassification( metadata.eval_langs = multilingual_eval_langs def load_data(self, **kwargs): - texts = ["This is a test sentence", "This is another test sentence"] * 3 + train_texts = ["This is a test sentence", "This is another train sentence"] * 3 + test_texts = ["This is a test 
sentence", "This is another test sentence"] * 3 labels = [[0, 1], [1, 0]] * 3 data = { "test": Dataset.from_dict( { - "text": texts, + "text": test_texts, "label": labels, } ), "train": Dataset.from_dict( { - "text": texts, + "text": train_texts, "label": labels, } ), @@ -1206,16 +1612,32 @@ class MockInstructionRetrival(AbsTaskInstructionRetrieval): do_length_ablation = True expected_stats = { "test": { - "average_changed_instruction_length": 37.0, + "num_samples": 4, + "num_docs": 2, + "num_queries": 2, + "number_of_characters": 244, + "min_document_length": 27, "average_document_length": 30.0, - "average_instruction_length": 29.0, + "max_document_length": 33, + "unique_docs": 2, + "min_query_length": 23, "average_query_length": 26.0, + "max_query_length": 29, + "unique_queries": 2, + "min_instruction_length": 26, + "average_instruction_length": 29.0, + "max_instruction_length": 32, + "unique_instructions": 2, + "min_changed_instruction_length": 34, + "average_changed_instruction_length": 37.0, + "max_changed_instruction_length": 40, + "unique_changed_instructions": 2, + "min_average_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, + "max_average_relevant_docs_per_query": 1, + "min_average_top_ranked_per_query": 2, "average_top_ranked_per_query": 2.0, - "num_docs": 2, - "num_queries": 2, - "num_samples": 4, - "number_of_characters": 244, + "max_average_top_ranked_per_query": 2, } } @@ -1297,36 +1719,84 @@ class MockMultilingualInstructionRetrival( "num_docs": 4, "num_queries": 4, "number_of_characters": 488, + "min_document_length": 27, "average_document_length": 30.0, + "max_document_length": 33, + "unique_docs": 2, + "min_query_length": 23, "average_query_length": 26.0, + "max_query_length": 29, + "unique_queries": 2, + "min_instruction_length": 26, "average_instruction_length": 29.0, + "max_instruction_length": 32, + "unique_instructions": 2, + "min_changed_instruction_length": 34, "average_changed_instruction_length": 37.0, + 
"max_changed_instruction_length": 40, + "unique_changed_instructions": 2, + "min_average_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, + "max_average_relevant_docs_per_query": 1, + "min_average_top_ranked_per_query": 2, "average_top_ranked_per_query": 2.0, + "max_average_top_ranked_per_query": 2, "hf_subset_descriptive_stats": { "eng": { "num_samples": 4, "num_docs": 2, "num_queries": 2, "number_of_characters": 244, + "min_document_length": 27, "average_document_length": 30.0, + "max_document_length": 33, + "unique_docs": 2, + "min_query_length": 23, "average_query_length": 26.0, + "max_query_length": 29, + "unique_queries": 2, + "min_instruction_length": 26, "average_instruction_length": 29.0, + "max_instruction_length": 32, + "unique_instructions": 2, + "min_changed_instruction_length": 34, "average_changed_instruction_length": 37.0, + "max_changed_instruction_length": 40, + "unique_changed_instructions": 2, + "min_average_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, + "max_average_relevant_docs_per_query": 1, + "min_average_top_ranked_per_query": 2, "average_top_ranked_per_query": 2.0, + "max_average_top_ranked_per_query": 2, }, "fra": { "num_samples": 4, "num_docs": 2, "num_queries": 2, "number_of_characters": 244, + "min_document_length": 27, "average_document_length": 30.0, + "max_document_length": 33, + "unique_docs": 2, + "min_query_length": 23, "average_query_length": 26.0, + "max_query_length": 29, + "unique_queries": 2, + "min_instruction_length": 26, "average_instruction_length": 29.0, + "max_instruction_length": 32, + "unique_instructions": 2, + "min_changed_instruction_length": 34, "average_changed_instruction_length": 37.0, + "max_changed_instruction_length": 40, + "unique_changed_instructions": 2, + "min_average_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 1.0, + "max_average_relevant_docs_per_query": 1, + "min_average_top_ranked_per_query": 2, "average_top_ranked_per_query": 2.0, + 
"max_average_top_ranked_per_query": 2, }, }, } diff --git a/tests/test_tasks/test_metadata.py b/tests/test_tasks/test_metadata.py index 1e7e1b24d..3d206da5c 100644 --- a/tests/test_tasks/test_metadata.py +++ b/tests/test_tasks/test_metadata.py @@ -8,9 +8,9 @@ @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID) def test_descriptive_stats(task): result_stat = task.calculate_metadata_metrics() + # remove descriptive task file + task.metadata.descriptive_stat_path.unlink() task_stat = task.expected_stats for key, value in result_stat.items(): assert key in task_stat assert value == task_stat[key] - # remove descriptive task file - task.metadata.descriptive_stat_path.unlink()