[WIP] Support new accelerate api #33

Draft · wants to merge 22 commits into master · showing changes from 18 commits
2 changes: 1 addition & 1 deletion README.md
@@ -33,7 +33,7 @@ Lastly, if you want to study the effect of multitask prompted training (a.k.a. i
- T-Zero ++: https://huggingface.co/bigscience/T0pp
- T-Zero Single Prompt: https://huggingface.co/bigscience/T0_single_prompt
- T-Zero Original Task Only: https://huggingface.co/bigscience/T0_original_task_only
- T-Zero 3B: https://huggingface.co/bigscience/T0_3B
- T-Zero 3B: https://huggingface.co/bigscience/T0_3B

## Citation

8 changes: 8 additions & 0 deletions debug/results.json
@@ -0,0 +1,8 @@
{
"dataset_name": "super_glue",
"dataset_config_name": "cb",
"template_name": "GPT-3 style",
"evaluation": {
"accuracy": 0.39285714285714285
}
}
258 changes: 145 additions & 113 deletions evaluation/run_eval.py
@@ -51,7 +51,6 @@ def parse_args():
parser.add_argument(
"--dataset_name",
type=str,
default=None,
help="The name of the dataset to use (via the datasets library).",
required=True,
)
@@ -61,12 +60,17 @@
default=None,
help="The configuration name of the dataset to use (via the datasets library).",
)
parser.add_argument(
"--template_config_name",
type=str,
default=None,
help="The name of the dataset_config_name of the template we want to use, example: use XNLI En prompts for XNLI Fr",
)
parser.add_argument(
"--template_name",
type=str,
default=None,
help="The template/prompt name",
required=True,
help="The template/prompt name. If None, we run all templates.",
)
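
With `--template_name` now optional, omitting it makes the script evaluate every prompt registered for the dataset. A minimal sketch of that lookup, assuming promptsource is installed (the `super_glue/cb` pair is only an example):

```python
# Sketch, not part of this diff: an omitted --template_name expands to
# every prompt registered for the dataset/config pair.
from promptsource.templates import DatasetTemplates

prompts = DatasetTemplates("super_glue/cb")
template_name = None  # i.e. --template_name was not passed
if template_name is not None:
    template_names = [template_name]
else:
    template_names = prompts.all_template_names  # all prompts for super_glue/cb
print(template_names)  # e.g. ['GPT-3 style', 'MNLI crowdsource', ...]
```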
parser.add_argument(
"--max_length",
@@ -128,115 +132,38 @@ def parse_args():
action="store_true",
help="Activate debug mode and run training only with a subset of data.",
)
parser.add_argument(
"--parallelize",
action="store_true",
help=(
"If passed, will call `model.parallelize` which splits the model on all GPUs available when applicable (model parallelism). "
"Note that this feature is still experimental in HF Transformers."
),
)
args = parser.parse_args()

return args

args = parser.parse_args()

def main():
args = parse_args()
# TODO @thomasw21 hack!
if args.dataset_config_name == "None":
args.dataset_config_name = None

# Initialize the accelerator. We will let the accelerator handle device placement for us.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)

# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
return args

def run_template(template_name, prompts, model, tokenizer, raw_datasets, accelerator: Accelerator, args):

# Handle the output directory creation
if accelerator.is_main_process:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()

# In distributed evaluation, the load_dataset function guarantees that only one local process can concurrently
# download the dataset.
if args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
if args.dataset_name == "anli":
raw_datasets = load_dataset(args.dataset_name, split=args.dataset_config_name)
else:
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name, split="validation")
#TODO(Victor): enable loading pre-processed dataset from https://huggingface.co/datasets/bigscience/P3

# Trim a number of evaluation examples
if args.debug:
raw_datasets = raw_datasets.select(range(min(len(raw_datasets),100)))

column_names = raw_datasets.column_names


# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args.config_name:
config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
raise ValueError(
"Either `args.config_name` or `args.model_name_or_path` should be provided."
)

if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
elif args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
result_dir = None
if args.output_dir is not None and accelerator.is_main_process:
paths = [
args.dataset_name,
args.dataset_config_name,
template_name,
]
result_dir = os.path.join(
args.output_dir,
*[path.replace(" ", "_") for path in paths if path is not None]
)
os.makedirs(result_dir, exist_ok=True)

if tokenizer.pad_token is None:
for token in [tokenizer.eos_token, tokenizer.bos_token, tokenizer.sep_token]:
if token is not None:
tokenizer.pad_token = token
if tokenizer.pad_token is None:
raise ValueError("Please define a pad token id.")

template = prompts[template_name]

model = ModelBase.from_config(
config=config,
model_name_or_path=args.model_name_or_path,
parallelize=args.parallelize
)

# Preprocessing the datasets.
# First we tokenize all the texts.
padding = "max_length" if args.pad_to_max_length else False

# Get the prompt to apply and the possible targets.
# TODO(Victor): If pulling from pre-processed data, remove this logic.
prompts = DatasetTemplates(
f"{args.dataset_name}"
if args.dataset_config_name is None
else f"{args.dataset_name}/{args.dataset_config_name}"
)
template = prompts[args.template_name]

column_names = raw_datasets.column_names
def preprocess_function(examples):
bs = len(examples[column_names[0]])

@@ -265,8 +192,9 @@ def preprocess_function(examples):
tokenized_targets = [
tokenizer(
ans_choi,
padding=True,
max_length=args.target_max_length,
# padding is on the right here.
padding=False,
max_length=args.max_length,
truncation=True,
)
for ans_choi in answer_choices_texts
@@ -319,17 +247,16 @@ def preprocess_function(examples):

eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)


# Use the device given by the `accelerator` object.
if not args.parallelize:
model.to(accelerator.device)

# Prepare everything with our `accelerator`.
eval_dataloader = accelerator.prepare(eval_dataloader)


# Metrics
metric = load_metric("accuracy")
metric = load_metric(
"accuracy",
process_id=accelerator.process_index,
num_process=accelerator.num_processes,
experiment_id=f"{args.dataset_name}_{args.dataset_config_name}_{args.template_name}"
)
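
The extra keyword arguments turn on the distributed mode of the legacy `datasets` metrics: every process contributes its shard of predictions under a shared `experiment_id`, and only the main process receives the aggregated score. A rough usage sketch, with `rank` and `world_size` standing in for `accelerator.process_index` and `accelerator.num_processes`:

```python
# Rough sketch of the distributed metric pattern (legacy datasets.load_metric).
from datasets import load_metric

rank, world_size = 0, 1  # placeholders for the accelerator's values
metric = load_metric("accuracy", process_id=rank, num_process=world_size,
                     experiment_id="shared_id_across_processes")
metric.add_batch(predictions=[1, 0, 1], references=[1, 1, 1])  # this shard only
score = metric.compute()  # aggregated dict on process 0, None elsewhere
```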

# Eval!
total_batch_size = args.per_device_eval_batch_size * accelerator.num_processes
@@ -359,14 +286,119 @@ def preprocess_function(examples):
results = {
"dataset_name": args.dataset_name,
"dataset_config_name": args.dataset_config_name,
"template_name": args.template_name,
"evaluation": eval_metric
"template_name": template_name,
"evaluation": eval_metric,
"arguments": str(args)
}
if accelerator.is_main_process:
if args.output_dir is not None:
with open(os.path.join(args.output_dir, "results.json"), "w") as f:
json.dump(results, f, indent=4)
if result_dir is not None:
with open(os.path.join(result_dir, "results.json"), "w") as f:
json.dump(results, f, indent=2)
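
With one `results.json` per template under `output_dir/<dataset>/<config>/<template>/`, collecting scores afterwards takes a small helper; a hypothetical sketch (not part of this PR):

```python
# Hypothetical helper, not in this PR: gather per-template accuracies
# from the result_dir layout written above.
import glob
import json
import os

def collect_results(output_dir: str) -> dict:
    scores = {}
    pattern = os.path.join(output_dir, "**", "results.json")
    for path in glob.glob(pattern, recursive=True):
        with open(path) as f:
            result = json.load(f)
        scores[result["template_name"]] = result["evaluation"]["accuracy"]
    return scores
```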

def main():
args = parse_args()

# Initialize the accelerator. We will let the accelerator handle device placement for us.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)

# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()

accelerator.wait_for_everyone()

# In distributed evaluation, the load_dataset function guarantees that only one local process can concurrently
# download the dataset.
# Downloading and loading a dataset from the hub.
if args.dataset_name == "anli":
raw_datasets = load_dataset(args.dataset_name, split=args.dataset_config_name)
else:
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name, split="validation")
#TODO(Victor): enable loading pre-processed dataset from https://huggingface.co/datasets/bigscience/P3
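
The `anli` special case exists because that dataset exposes its rounds as splits rather than configurations, so the config name doubles as the split name. A short sketch of the two loading paths (assuming the Hub datasets are available):

```python
# Sketch of the two load_dataset paths above: anli uses round-suffixed
# splits (e.g. "dev_r1"), while most datasets use a config + "validation".
from datasets import load_dataset

anli_round1 = load_dataset("anli", split="dev_r1")
cb_validation = load_dataset("super_glue", "cb", split="validation")
```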

# Trim a number of evaluation examples
if args.debug:
raw_datasets = raw_datasets.select(range(min(len(raw_datasets),100)))

# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args.config_name:
config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
raise ValueError(
"Either `args.config_name` or `args.model_name_or_path` should be provided."
)

if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer, padding_side="left")
elif args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer, padding_side="left")
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
)

if tokenizer.pad_token is None:
for token in [tokenizer.eos_token, tokenizer.bos_token, tokenizer.sep_token]:
if token is not None:
tokenizer.pad_token = token
if tokenizer.pad_token is None:
raise ValueError("Please define a pad token id.")
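
This fallback matters for decoder-only checkpoints such as GPT-2, whose tokenizer ships without a pad token. A quick illustration (a sketch, not part of the diff):

```python
# Sketch: gpt2 defines no pad token, so the fallback above reuses eos.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
assert tokenizer.pad_token is None
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token)  # '<|endoftext|>'
```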


model = ModelBase.from_config(
config=config,
model_name_or_path=args.model_name_or_path
)
model = accelerator.prepare_model(model)

# Get the prompt to apply and the possible targets.
# TODO(Victor): If pulling from pre-processed data, remove this logic.

if args.dataset_config_name is None or args.template_config_name is None or args.dataset_name == "anli":
Review comment from @thomasw21 (Member, Author), Jul 17, 2022:

Whoops, merged too fast; this should've been

if (args.dataset_config_name is None and args.template_config_name is None) or args.dataset_name == "anli"

cc @lintangsutawika

prompt_dataset_name = f"{args.dataset_name}"
elif args.template_config_name is not None:
prompt_dataset_name = f"{args.dataset_name}/{args.template_config_name}"
else:
prompt_dataset_name = f"{args.dataset_name}/{args.dataset_config_name}"

prompts = DatasetTemplates(
prompt_dataset_name
)

if args.template_name is not None:
template_names = [args.template_name]
else:
template_names = prompts.all_template_names

for template_name in template_names:
run_template(
template_name=template_name,
prompts=prompts,
model=model,
tokenizer=tokenizer,
raw_datasets=raw_datasets,
accelerator=accelerator,
args=args
)

if __name__ == "__main__":
main()
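
Stepping back, the heart of this PR is replacing the removed `--parallelize`/`model.parallelize()` path with accelerate's device handling. Under that API, the evaluation flow reduces to roughly this sketch (assuming `model` and `eval_dataloader` are built as above, and that `ModelBase.forward(batch)` returns per-example predictions):

```python
# Minimal sketch of the accelerate-driven evaluation flow in this PR.
import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = accelerator.prepare_model(model)         # accelerate owns device placement
eval_dataloader = accelerator.prepare(eval_dataloader)

model.eval()
for batch in eval_dataloader:                    # batches arrive on the right device
    with torch.no_grad():
        predictions = model(batch)               # ModelBase.forward(batch) -> Tensor
```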
24 changes: 13 additions & 11 deletions t0/model.py
@@ -27,7 +27,7 @@ def from_config(config, **kwargs) -> "ModelBase":
raise NotImplementedError

class EncoderDecoderModel(ModelBase):
def __init__(self, config, model_name_or_path: Optional[str], parallelize: bool, **kwargs):
def __init__(self, config, model_name_or_path: Optional[str], **kwargs):
"""

Args:
@@ -46,11 +46,9 @@ def __init__(self, config, model_name_or_path: Optional[str], parallelize: bool,
)
else:
logger.info("Training new model from scratch")
self._model = AutoModelForSeq2SeqLM.from_config(config)

if parallelize:
assert torch.cuda.is_available(), "You need at least 1 GPU to call `parallelize` (even though if there is only 1 GPU, there won't be any model parallelism)."
self._model.parallelize()
self._model = AutoModelForSeq2SeqLM.from_config(
config,
)


def forward(self, batch) -> torch.Tensor:
@@ -78,19 +76,23 @@ def __init__(self, config, model_name_or_path: Optional[str], **kwargs):
)
else:
logger.info("Training new model from scratch")
self._model = AutoModelForCausalLM.from_config(config)
self._model = AutoModelForCausalLM.from_config(
config,
)

def forward(self, batch):
device = batch["input_ids"].device
_, prefix_length = batch["input_ids"].shape

model_inputs = {
"input_ids": torch.cat([batch["input_ids"], batch["labels"]], dim=-1),
"attention_mask": torch.cat([batch["attention_mask"], batch["labels_attention_mask"]], dim=-1),
}
# Set position ids correctly to take care of padding tokens between input_ids and labels.
# An all-zero ("empty") attention_mask is a forbidden value; in fact, the first element should be 1, since the input
# cannot be empty.
assert torch.all(model_inputs["attention_mask"][:,0] == 1), "First element in the attention mask should be 1."
position_ids = torch.cumsum(model_inputs["attention_mask"].to(torch.long), dim=-1) - 1
position_ids = torch.maximum(
torch.cumsum(model_inputs["attention_mask"].to(torch.long), dim=-1) - 1,
torch.zeros(1, dtype=torch.long, device=device)[None, None]
)
model_inputs["position_ids"] = position_ids

logits = self._model(**model_inputs).logits[:, prefix_length-1:-1]
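
The `torch.maximum` clamp guards the new left-padding setup: a bare `cumsum - 1` assigns position `-1` to leading pad tokens, which is not a valid position id. A standalone worked example:

```python
# Worked example of the position-id clamp for one left-padded row.
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])              # two pads on the left
raw = torch.cumsum(attention_mask.to(torch.long), dim=-1) - 1
print(raw)                   # tensor([[-1, -1,  0,  1,  2]]) -> -1 is invalid
position_ids = raw.clamp(min=0)                               # same effect as the torch.maximum above
print(position_ids)          # tensor([[0, 0, 0, 1, 2]])
```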