#34766 IsADirectoryError when training with tqdm enabled for trainer
Issue Details
Author
System Info
Error info:
**IsADirectoryError**: [Errno 21] Is a directory: '\n <div>\n \n <progress value=\'2\' max=\'108\' style=\'width:300px; height:20px; vertical-align: middle;\'></progress>\n [ 2/108 : < :, Epoch 0.04/4]\n </div>\n <table border="1" class="dataframe">\n <thead>\n <tr style="text-align: left;">\n <th>Step</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>'
Code:
```python
training_args = transformers.TrainingArguments(
    num_train_epochs=4,  # Number of training epochs
    per_device_train_batch_size=batch_size,  # Batch size for training
    per_device_eval_batch_size=batch_size,  # Batch size for evaluation
    gradient_accumulation_steps=2,  # Number of steps to accumulate gradients before updating
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    do_eval=True,  # Perform evaluation during training
    save_total_limit=2,  # Limit the total number of saved checkpoints
    evaluation_strategy="steps",  # Evaluation strategy to use (here, at each specified number of steps)
    save_strategy="steps",  # Save checkpoints at each specified number of steps
    save_steps=10,  # Number of steps between each checkpoint save
    eval_steps=10,  # Number of steps between each evaluation
    max_grad_norm=1,  # Maximum gradient norm for clipping
    warmup_ratio=0.1,  # Warmup ratio for learning rate schedule
    weight_decay=0.001,  # Regularization technique to prevent overfitting
    # fp16=True,  # Enable mixed precision training with fp16 (enable it if Ampere architecture is unavailable)
    bf16=True,  # Enable mixed precision training with bf16
    logging_steps=10,  # Number of steps between each log
    output_dir="outputs",  # Directory to save the model outputs and checkpoints
    optim="adamw_torch",  # Optimizer to use (AdamW with PyTorch)
    learning_rate=5e-5,  # Learning rate for the optimizer
    lr_scheduler_type="linear",  # Learning rate scheduler type: constant
    load_best_model_at_end=True,  # Load the best model found during training at the end
    metric_for_best_model="rouge",  # Metric used to determine the best model
    greater_is_better=True,  # Indicates if a higher metric score is better
    push_to_hub=False,  # Whether to push the model to Hugging Face Hub
    run_name="finetuning",  # Name of the run for experiment tracking
    report_to="wandb"  # For experiment tracking (login to Weights & Biases needed)
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
```
Env info: Jupyter version:
```
!jupyter --version
IPython          : 8.27.0
ipykernel        : 6.29.5
ipywidgets       : 7.7.1
jupyter_client   : 7.4.9
jupyter_core     : 5.7.2
jupyter_server   : 2.14.2
jupyterlab       : 4.0.11
nbclient         : 0.10.0
nbconvert        : 7.16.4
nbformat         : 5.10.4
notebook         : 6.5.7
qtconsole        : 5.6.0
traitlets        : 5.14.3
```
Python: 3.10.11 jupyter lab: 4.0.11 transformers: 4.45.2
Detailed errors:
IsADirectoryError Traceback (most recent call last) Cell In[28], line 1 ----> 1 trainer.train() File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/transformers/trainer.py:2052, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs) 2050 hf_hub_utils.enable_progress_bars() 2051 else: -> 2052 return inner_training_loop( 2053 args=args, 2054 resume_from_checkpoint=resume_from_checkpoint, 2055 trial=trial, 2056 ignore_keys_for_eval=ignore_keys_for_eval, 2057 ) File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/transformers/trainer.py:2465, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval) 2463 self.state.global_step += 1 2464 self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch -> 2465 self.control = self.callback_handler.on_step_end(args, self.state, self.control) 2467 self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) 2468 else: File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/transformers/trainer_callback.py:494, in CallbackHandler.on_step_end(self, args, state, control) 493 def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): --> 494 return self.call_event("on_step_end", args, state, control) File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/transformers/trainer_callback.py:516, in CallbackHandler.call_event(self, event, args, state, control, **kwargs) 514 def call_event(self, event, args, state, control, **kwargs): 515 for callback in self.callbacks: --> 516 result = getattr(callback, event)( 517 args, 518 state, 519 control, 520 model=self.model, 521 tokenizer=self.tokenizer, 522 optimizer=self.optimizer, 523 lr_scheduler=self.lr_scheduler, 524 train_dataloader=self.train_dataloader, 525 eval_dataloader=self.eval_dataloader, 526 **kwargs, 527 ) 528 # A Callback can skip the return of `control` if it 
doesn't change it. 529 if result is not None: File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/transformers/utils/notebook.py:307, in NotebookProgressCallback.on_step_end(self, args, state, control, **kwargs) 305 def on_step_end(self, args, state, control, **kwargs): 306 epoch = int(state.epoch) if int(state.epoch) == state.epoch else f"{state.epoch:.2f}" --> 307 self.training_tracker.update( 308 state.global_step + 1, 309 comment=f"Epoch {epoch}/{state.num_train_epochs}", 310 force_update=self._force_next_update, 311 ) 312 self._force_next_update = False File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/transformers/utils/notebook.py:143, in NotebookProgressBar.update(self, value, force_update, comment) 141 self.first_calls = self.warmup 142 self.wait_for = 1 --> 143 self.update_bar(value) 144 elif value <= self.last_value and not force_update: 145 return File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/transformers/utils/notebook.py:188, in NotebookProgressBar.update_bar(self, value, comment) 185 self.label += f", {1/self.average_time_per_item:.2f} it/s" 187 self.label += "]" if self.comment is None or len(self.comment) == 0 else f", {self.comment}]" --> 188 self.display() File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/transformers/utils/notebook.py:229, in NotebookTrainingTracker.display(self) 227 self.html_code += self.child_bar.html_code 228 if self.output is None: --> 229 self.output = disp.display(disp.HTML(self.html_code), display_id=True) 230 else: 231 self.output.update(disp.HTML(self.html_code)) File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/IPython/core/display.py:432, in HTML.__init__(self, data, url, filename, metadata) 430 if warn(): 431 warnings.warn("Consider using IPython.display.IFrame instead") --> 432 super(HTML, self).__init__(data=data, url=url, filename=filename, metadata=metadata) File 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File /anaconda/envs/azureml_py38_PT_TF/lib/python3.10/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import IsADirectoryError: [Errno 21] Is a directory: '\n <div>\n \n <progress value=\'2\' max=\'108\' style=\'width:300px; height:20px; vertical-align: middle;\'></progress>\n [ 2/108 : < :, Epoch 0.04/4]\n </div>\n <table border="1" class="dataframe">\n <thead>\n <tr style="text-align: left;">\n <th>Step</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>'
Who can help?
No response
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
This can be reproduced by the following code:
```python
import time

import transformers
from transformers.utils.notebook import NotebookProgressBar

pbar = NotebookProgressBar(100)
for val in range(100):
    pbar.update(val)
    time.sleep(0.07)
pbar.update(100)
```
Expected behavior
Training with progress bar being updated: