You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
在ChatGLM3-6B的微调过程中,遇到如下报错(The following error occurs during the fine-tuning of ChatGLM3-6B)ImportError: cannot import name 'log' from 'torch.distributed.elastic.agent.server.api'
#1308
Open
Scorponok31 opened this issue
Aug 5, 2024
· 0 comments
我分别用LoRA或者P-Tuning v2微调,都会出现该报错,我使用的是官方demo进行的微调,单卡或者多卡都会出现这个问题,请各位帮忙检查这个错误到底如何修正?Linux服务器,显卡为2张L40S
(When fine-tuning ChatGLM3-6B, I hit this error with both LoRA and P-Tuning v2. I used the official demo for fine-tuning, and the problem occurs on both single-GPU and multi-GPU runs. Could anyone help figure out how to fix this error? Linux server with two L40S GPUs.)
The text was updated successfully, but these errors were encountered:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/chatglm3/ChatGLM3/finetune_demo/finetune_h │
│ f.py:537 in main │
│ │
│ 534 │ ) │
│ 535 │ │
│   536 │   if auto_resume_from_checkpoint.upper() == "" or auto_resume_from_checkpoint is None: │
│ ❱ 537 │ │ trainer.train() │
│ 538 │ else: │
│ 539 │ │ def do_rf_checkpoint(sn): │
│ 540 │ │ │ model.gradient_checkpointing_enable() │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/transformers/ │
│ trainer.py:1885 in train │
│ │
│ 1882 │ │ │ finally: │
│ 1883 │ │ │ │ hf_hub_utils.enable_progress_bars() │
│ 1884 │ │ else: │
│ ❱ 1885 │ │ │ return inner_training_loop( │
│ 1886 │ │ │ │ args=args, │
│ 1887 │ │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1888 │ │ │ │ trial=trial, │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/transformers/ │
│ trainer.py:2022 in _inner_training_loop │
│ │
│ 2019 │ │ │ │
│ 2020 │ │ │ self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradi │
│ 2021 │ │ │
│ ❱ 2022 │ │ model = self._wrap_model(self.model_wrapped) │
│ 2023 │ │ │
│   2024 │   │   # as the model is wrapped, don't use `accelerator.prepare`                      │
│   2025 │   │   # this is for unhandled cases such as                                           │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/transformers/ │
│ trainer.py:1640 in _wrap_model │
│ │
│ 1637 │ │ │ return smp.DistributedModel(model, backward_passes_per_step=self.args.gradie │
│ 1638 │ │ │
│ 1639 │ │ # train/eval could be run multiple-times - if already wrapped, don't re-wrap it │
│ ❱ 1640 │ │ if self.accelerator.unwrap_model(model) is not model: │
│ 1641 │ │ │ return model │
│ 1642 │ │ │
│ 1643 │ │ # Mixed precision training with apex (torch < 1.6) │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/accelerate/ac │
│ celerator.py:2540 in unwrap_model │
│ │
│ 2537 │ │ MyModel │
│ 2538 │ │ ``` │
│ 2539 │ │ """ │
│ ❱ 2540 │ │ return extract_model_from_parallel(model, keep_fp32_wrapper) │
│ 2541 │ │
│ 2542 │ def wait_for_everyone(self): │
│ 2543 │ │ """ │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/accelerate/ut │
│ ils/other.py:80 in extract_model_from_parallel │
│ │
│ 77 │ │ model = model._orig_mod │
│ 78 │ │
│ 79 │ if is_deepspeed_available(): │
│ ❱ 80 │ │ from deepspeed import DeepSpeedEngine │
│ 81 │ │ │
│ 82 │ │ options += (DeepSpeedEngine,) │
│ 83 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/i │
│ nit.py:22 in │
│ │
│ 19 │ HAS_TRITON = False │
│ 20 │
│ 21 from . import ops │
│ ❱ 22 from . import module_inject │
│ 23 │
│ 24 from .accelerator import get_accelerator │
│ 25 from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedu │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/mod │
│ ule_inject/init.py:6 in │
│ │
│ 3 │
│ 4 # DeepSpeed Team │
│ 5 │
│ ❱ 6 from .replace_module import replace_transformer_layer, revert_transformer_layer, Replace │
│ 7 from .module_quantize import quantize_transformer_layer │
│ 8 from .replace_policy import HFBertLayerPolicy │
│ 9 from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/mod │
│ ule_inject/replace_module.py:607 in │
│ │
│ 604 │ return replaced_module │
│ 605 │
│ 606 │
│ ❱ 607 from ..pipe import PipelineModule │
│ 608 │
│ 609 import re │
│ 610 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/pip │
│ e/init.py:6 in │
│ │
│ 3 │
│ 4 # DeepSpeed Team │
│ 5 │
│ ❱ 6 from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec │
│ 7 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/run │
│ time/pipe/init.py:6 in │
│ │
│ 3 │
│ 4 # DeepSpeed Team │
│ 5 │
│ ❱ 6 from .module import PipelineModule, LayerSpec, TiedLayerSpec │
│ 7 from .topology import ProcessTopology │
│ 8 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/run │
│ time/pipe/module.py:19 in │
│ │
│ 16 │
│ 17 from deepspeed.utils import logger │
│ 18 from .. import utils as ds_utils │
│ ❱ 19 from ..activation_checkpointing import checkpointing │
│ 20 from .topology import PipeDataParallelTopology, PipelineParallelGrid │
│ 21 from deepspeed.runtime.state_dict_factory import SDLoaderFactory │
│ 22 from deepspeed.accelerator import get_accelerator │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/run │
│ time/activation_checkpointing/checkpointing.py:26 in │
│ │
│ 23 import mmap │
│ 24 from torch import _C │
│ 25 │
│ ❱ 26 from deepspeed.runtime.config import DeepSpeedConfig │
│ 27 from deepspeed.utils import logger │
│ 28 from deepspeed.runtime.utils import copy_to_device, move_to_device, see_memory_usage, bw │
│ 29 from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers, FORWARD_GLOBAL_T │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/run │
│ time/config.py:41 in │
│ │
│ 38 from ..git_version_info import version as version │
│ 39 from ..utils import logger │
│ 40 │
│ ❱ 41 from ..elasticity import ( │
│ 42 │ elasticity_enabled, │
│ 43 │ compute_elastic_config, │
│ 44 │ ensure_immutable_elastic_config, │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/ela │
│ sticity/init.py:10 in │
│ │
│ 7 from .utils import is_torch_elastic_compatible │
│ 8 from .constants import ENABLED, ENABLED_DEFAULT, ELASTICITY │
│ 9 if is_torch_elastic_compatible(): │
│ ❱ 10 │ from .elastic_agent import DSElasticAgent │
│ 11 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/ela │
│ sticity/elastic_agent.py:9 in │
│ │
│ 6 from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent │
│ 7 from typing import Any, Dict, Optional, Tuple │
│ 8 from datetime import datetime │
│ ❱ 9 from torch.distributed.elastic.agent.server.api import log, _get_socket_with_port │
│ 10 from torch.distributed.elastic.metrics import put_metric │
│ 11 from torch.distributed.elastic.agent.server.api import ( │
│ 12 │ RunResult, │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ImportError: cannot import name 'log' from 'torch.distributed.elastic.agent.server.api'
ComplelteError.txt
我分别用LoRA或者P-Tuning v2微调,都会出现该报错,我使用的是官方demo进行的微调,单卡或者多卡都会出现这个问题,请各位帮忙检查这个错误到底如何修正?Linux服务器,显卡为2张L40S
(When fine-tuning ChatGLM3-6B, I hit this error with both LoRA and P-Tuning v2. I used the official demo for fine-tuning, and the problem occurs on both single-GPU and multi-GPU runs. Could anyone help figure out how to fix this error? Linux server with two L40S GPUs.)
The text was updated successfully, but these errors were encountered: