成功解决ImportError: cannot import name 'log' from 'torch.distributed.elastic.agent.server.api'
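问题描述：在 torch==2.4.0 环境下用 torchrun 启动训练时，执行 import deepspeed 报出如下错误（日志中每行堆栈重复出现，应为 rank 0 与 rank 1 两个进程的输出交错所致）：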
Traceback (most recent call last):
File "/mnt/data/group/zzk/projects/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 18, in <module>
File "/mnt/data/group/zzk/projects/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 18, in <module>
from internvl.dist_utils import init_dist
File "/mnt/data/group/zzk/projects/InternVL/internvl_chat/internvl/dist_utils.py", line 6, in <module>
from internvl.dist_utils import init_dist
File "/mnt/data/group/zzk/projects/InternVL/internvl_chat/internvl/dist_utils.py", line 6, in <module>
import deepspeed
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/__init__.py", line 26, in <module>
import deepspeed
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/__init__.py", line 26, in <module>
from . import module_inject
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/module_inject/__init__.py", line 6, in <module>
from . import module_inject
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/module_inject/__init__.py", line 6, in <module>
from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/module_inject/replace_module.py", line 607, in <module>
from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/module_inject/replace_module.py", line 607, in <module>
from ..pipe import PipelineModule
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/pipe/__init__.py", line 6, in <module>
from ..pipe import PipelineModule
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/pipe/__init__.py", line 6, in <module>
from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/pipe/__init__.py", line 6, in <module>
from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/pipe/__init__.py", line 6, in <module>
from .module import PipelineModule, LayerSpec, TiedLayerSpec
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/pipe/module.py", line 19, in <module>
from .module import PipelineModule, LayerSpec, TiedLayerSpec
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/pipe/module.py", line 19, in <module>
from ..activation_checkpointing import checkpointing
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 26, in <module>
from ..activation_checkpointing import checkpointing
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 26, in <module>
from deepspeed.runtime.config import DeepSpeedConfig
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/config.py", line 42, in <module>
from deepspeed.runtime.config import DeepSpeedConfig
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/config.py", line 42, in <module>
from ..elasticity import (
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/elasticity/__init__.py", line 10, in <module>
from ..elasticity import (
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/elasticity/__init__.py", line 10, in <module>
from .elastic_agent import DSElasticAgent
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/elasticity/elastic_agent.py", line 9, in <module>
from .elastic_agent import DSElasticAgent
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/deepspeed/elasticity/elastic_agent.py", line 9, in <module>
from torch.distributed.elastic.agent.server.api import log, _get_socket_with_port
ImportError: cannot import name 'log' from 'torch.distributed.elastic.agent.server.api' (/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py)
from torch.distributed.elastic.agent.server.api import log, _get_socket_with_port
ImportError: cannot import name 'log' from 'torch.distributed.elastic.agent.server.api' (/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py)
E0821 17:06:14.563819 140359981750080 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 4258) of binary: /mnt/data/group/zzk/projects/envs/internvl/bin/python
Traceback (most recent call last):
File "/mnt/data/group/zzk/projects/envs/internvl/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.4.0', 'console_scripts', 'torchrun')())
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
return f(*args, **kwargs)
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/mnt/data/group/zzk/projects/envs/internvl/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
internvl/train/internvl_chat_finetune.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-08-21_17:06:14
host : dsw-79197-8465954549-np84d
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 4259)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-08-21_17:06:14
host : dsw-79197-8465954549-np84d
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 4258)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
解决方案一:
将 deepspeed 升级到 0.14.4 版本：
pip install deepspeed==0.14.4
解决方案二:
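若不方便升级，可手动修改报错的文件 deepspeed/elasticity/elastic_agent.py 第 9 行，将导入语句中的“log”更改为“logger”。例如改为 from torch.distributed.elastic.agent.server.api import logger as log, _get_socket_with_port，这样该文件中其余使用 log 的代码无需再改动。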



♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠ ⊕ ♠