
Fine-tuning example run issue #1337

Open · 1 of 2 tasks

hgh-xxh opened this issue Nov 21, 2024 · 1 comment

hgh-xxh commented Nov 21, 2024

System Info

CUDA 12.1, torch 2.3.0, transformers 4.40.0; the GPU is a Tesla V100-PCIE-32GB.

Who can help?

@Btlmd

Information

  • The official example scripts
  • My own modified scripts

Reproduction

╭──────────────────────────────────────────────────────────────────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────────────────────────────────────────────────────────────────╮
│ /home/cug100/d2l-zh/pytorch/hgh/pycharmProject/base-model-test/LLM/ChatGLM3-main/finetune_demo/finetune_hf.py:540 in main │
│ │
│ 537 │ ) │
│ 538 │ │
│ 539 │ if auto_resume_from_checkpoint.upper() == "" or auto_resume_from_checkpoint is None: │
│ ❱ 540 │ │ trainer.train() │
│ 541 │ else: │
│ 542 │ │ def do_rf_checkpoint(sn): │
│ 543 │ │ │ model.gradient_checkpointing_enable() │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/transformers/trainer.py:1859 in train │
│ │
│ 1856 │ │ │ finally: │
│ 1857 │ │ │ │ hf_hub_utils.enable_progress_bars() │
│ 1858 │ │ else: │
│ ❱ 1859 │ │ │ return inner_training_loop( │
│ 1860 │ │ │ │ args=args, │
│ 1861 │ │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1862 │ │ │ │ trial=trial, │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/transformers/trainer.py:1960 in _inner_training_loop │
│ │
│ 1957 │ │ │ self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps= │
│ 1958 │ │ │
│ 1959 │ │ if not delay_optimizer_creation: │
│ ❱ 1960 │ │ │ self.create_optimizer_and_scheduler(num_training_steps=max_steps) │
│ 1961 │ │ │
│ 1962 │ │ self.state = TrainerState() │
│ 1963 │ │ self.state.is_hyper_param_search = trial is not None │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/transformers/trainer.py:992 in create_optimizer_and_scheduler │
│ │
│ 989 │ │ Trainer's init through `optimizers`, or subclass and override this method (or `c │
│ 990 │ │ create_scheduler`) in a subclass. │
│ 991 │ │ """ │
│ ❱ 992 │ │ self.create_optimizer() │
│ 993 │ │ if IS_SAGEMAKER_MP_POST_1_10 and smp.state.cfg.fp16: │
│ 994 │ │ │ # If smp >= 1.10 and fp16 is enabled, we unwrap the optimizer │
│ 995 │ │ │ optimizer = self.optimizer.optimizer │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/transformers/trainer.py:1049 in create_optimizer │
│ │
│ 1046 │ │ │ if "optimizer_dict" in optimizer_kwargs: │
│ 1047 │ │ │ │ optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict") │
│ 1048 │ │ │ │
│ ❱ 1049 │ │ │ self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwa │
│ 1050 │ │ │ if optimizer_cls.__name__ == "Adam8bit": │
│ 1051 │ │ │ │ import bitsandbytes │
│ 1052 │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/optim/adamw.py:53 in __init__ │
│ │
│ 50 │ │ │ differentiable=differentiable, │
│ 51 │ │ │ fused=fused, │
│ 52 │ │ ) │
│ ❱ 53 │ │ super().__init__(params, defaults) │
│ 54 │ │ │
│ 55 │ │ if fused: │
│ 56 │ │ │ if differentiable: │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/optim/optimizer.py:284 in __init__ │
│ │
│ 281 │ │ │ param_groups = [{'params': param_groups}] │
│ 282 │ │ │
│ 283 │ │ for param_group in param_groups: │
│ ❱ 284 │ │ │ self.add_param_group(cast(dict, param_group)) │
│ 285 │ │ │
│ 286 │ │ # Allows _cuda_graph_capture_health_check to rig a poor man's TORCH_WARN_ONCE in │
│ 287 │ │ # which I don't think exists │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/_compile.py:22 in inner │
│ │
│ 19 │ │ │
│ 20 │ │ @functools.wraps(fn) │
│ 21 │ │ def inner(*args, **kwargs): │
│ ❱ 22 │ │ │ import torch._dynamo │
│ 23 │ │ │ │
│ 24 │ │ │ return torch._dynamo.disable(fn, recursive)(*args, **kwargs) │
│ 25 │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/_dynamo/__init__.py:2 in <module> │
│ │
│ 1 import torch │
│ ❱ 2 from . import convert_frame, eval_frame, resume_execution │
│ 3 from .backends.registry import list_backends, lookup_backend, register_backend │
│ 4 from .callback import callback_handler, on_compile_end, on_compile_start │
│ 5 from .code_context import code_context │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:40 in <module> │
│ │
│ 37 from torch.utils._python_dispatch import _disable_current_modes │
│ 38 from torch.utils._traceback import format_traceback_short │
│ 39 │
│ ❱ 40 from . import config, exc, trace_rules │
│ 41 from .backends.registry import CompilerFn │
│ 42 from .bytecode_analysis import remove_dead_code, remove_pointless_jumps │
│ 43 from .bytecode_transformation import ( │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/_dynamo/trace_rules.py:50 in <module> │
│ │
│ 47 from ..utils import _config_module │
│ 48 from .utils import getfile, hashable, NP_SUPPORTED_MODULES, unwrap_if_wrapper │
│ 49 │
│ ❱ 50 from .variables import ( │
│ 51 │ BuiltinVariable, │
│ 52 │ FunctorchHigherOrderVariable, │
│ 53 │ NestedUserFunctionVariable, │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/_dynamo/variables/__init__.py:34 in <module> │
│ │
│ 31 │ UserFunctionVariable, │
│ 32 │ UserMethodVariable, │
│ 33 ) │
│ ❱ 34 from .higher_order_ops import ( │
│ 35 │ FunctorchHigherOrderVariable, │
│ 36 │ TorchHigherOrderOperatorVariable, │
│ 37 ) │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/_dynamo/variables/higher_order_ops.py:13 in <module> │
│ │
│ 10 import torch._C │
│ 11 import torch.fx │
│ 12 import torch.nn │
│ ❱ 13 import torch.onnx.operators │
│ 14 from torch._dynamo.utils import deepcopy_to_fake_tensor, get_fake_value, get_real_value │
│ 15 from torch._dynamo.variables.base import VariableTracker │
│ 16 from torch._dynamo.variables.builtin import BuiltinVariable │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/onnx/__init__.py:46 in <module> │
│ │
│ 43 │ unregister_custom_op_symbolic, │
│ 44 ) │
│ 45 │
│ ❱ 46 from ._internal.exporter import ( # usort:skip. needs to be last to avoid circular impo │
│ 47 │ DiagnosticOptions, │
│ 48 │ ExportOptions, │
│ 49 │ ONNXProgram, │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/onnx/_internal/exporter/__init__.py:13 in <module> │
│ │
│ 10 ] │
│ 11 │
│ 12 from . import _testing as testing, _verification as verification │
│ ❱ 13 from ._analysis import analyze │
│ 14 from ._compat import export_compat │
│ 15 from ._core import export, exported_program_to_ir │
│ 16 from ._onnx_program import ONNXProgram │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/onnx/_internal/exporter/_analysis.py:14 in <module> │
│ │
│ 11 from typing import TYPE_CHECKING │
│ 12 │
│ 13 import torch │
│ ❱ 14 import torch._export.serde.schema │
│ 15 from torch.export import graph_signature │
│ 16 from torch.onnx._internal.exporter import _dispatching, _registration │
│ 17 │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/_export/__init__.py:40 in <module> │
│ │
│ 37 from torch._subclasses.functional_tensor import FunctionalTensor │
│ 38 from torch._utils_internal import log_export_usage │
│ 39 from torch.export._tree_utils import reorder_kwargs │
│ ❱ 40 from torch.export._unlift import _create_stateful_graph_module │
│ 41 from torch.export.dynamic_shapes import ( │
│ 42 │ _process_constraints, │
│ 43 │ _process_dynamic_shapes, │
│ │
│ /home/cug100/anaconda3/envs/hgh-nlp-envs-llm/lib/python3.10/site-packages/torch/export/_unlift.py:20 in <module> │
│ │
│ 17 ) │
│ 18 │
│ 19 │
│ ❱ 20 @torch._dynamo.disable │
│ 21 def _check_input_constraints_pre_hook(self, *args, **kwargs): │
│ 22 │ flat_args_with_path, received_spec = pytree.tree_flatten_with_path(args) │
│ 23 │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
AttributeError: partially initialized module 'torch._dynamo' has no attribute 'disable' (most likely due to a circular import)

Expected behavior

When running the example code, the first part of the output:
'...反': 54955 -> 54955
'差': 55342 -> 55342
'萌': 56842 -> 56842
'。': 31155 -> 31155
'': 2 -> 2...
matches the example output, but after that the error above appears. I'm not sure what causes it; Python and the required packages were installed as instructed, with versions kept consistent with the requirements.
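
One quick way to isolate this class of error (a hypothetical diagnostic, not part of the original report) is to import torch._dynamo in a fresh interpreter, run from a directory outside the ChatGLM3 checkout so that finetune_hf.py and any local files that might shadow torch are ruled out. Assuming the conda environment described above:

```bash
# If this one-liner raises the same AttributeError, the torch installation
# itself is broken or shadowed, independent of the finetuning script.
python -c "import torch, torch._dynamo; print(torch.__version__, torch._dynamo.disable)"
```

On a healthy torch 2.3.0 install this prints the version followed by the repr of the torch._dynamo.disable function.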

hgh-xxh (Author) commented Nov 21, 2024

Hello, this issue can be closed. After investigation, it was likely a torch cache problem; uninstalling torch and clearing the cache before reinstalling solved the problem!
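
For readers hitting the same traceback, here is a minimal sketch of the reinstall described above, assuming pip inside the same conda environment (the cu121 index matches the reported CUDA 12.1; adjust the version and index URL to your setup):

```bash
pip uninstall -y torch
pip cache purge   # drop pip's cached wheels so a possibly corrupted build is not reused
pip install torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121
```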
