feat: Add support for deepseek recipes #5011
@@ -26,6 +26,7 @@
    _load_recipes_cfg,
    _configure_gpu_args,
    _configure_trainium_args,
    _get_trainining_recipe_gpu_model_name_and_script,
)
from sagemaker.modules.utils import _run_clone_command_silent
from sagemaker.modules.configs import Compute

@@ -178,3 +179,37 @@ def test_get_args_from_recipe_compute(
    assert mock_gpu_args.call_count == 0
    assert mock_trainium_args.call_count == 0
    assert args is None
@pytest.mark.parametrize(
    "test_case",
    [
        {
            "model_type": "llama_v3",
            "script": "llama_pretrain.py",
            "model_base_name": "llama_v3",
        },
        {
            "model_type": "mistral",
            "script": "mistral_pretrain.py",
            "model_base_name": "mistral",
        },
        {
            "model_type": "deepseek_llamav3",
            "script": "deepseek_pretrain.py",
            "model_base_name": "deepseek",
        },
        {
            "model_type": "deepseek_qwenv2",
            "script": "deepseek_pretrain.py",
            "model_base_name": "deepseek",
        },
    ],
)
def test_get_trainining_recipe_gpu_model_name_and_script(test_case):
    model_type = test_case["model_type"]
    script = test_case["script"]
    model_base_name, script = _get_trainining_recipe_gpu_model_name_and_script(
        model_type, script
    )
    assert model_base_name == test_case["model_base_name"]
    assert script == test_case["script"]

Review thread on the @pytest.mark.parametrize line:
- Reviewer: Thanks Erick! Can you copy this test case/change to tests/unit/test_pytorch.py as well?
- Author: Added some for pytorch as well.

Review thread on the "deepseek_llamav3" test case:
- Reviewer: To be extra safe, can we align on …
- Author: Would these have different training scripts? Or do you mean just add a test case for …
- Reviewer: No, I'm just saying instead I think it will be easier to extend support to the v3 and native r1 model from deepseek if we use …
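
For context, the parametrized expectations above imply a mapping from a recipe `model_type` to a `(model_base_name, script)` pair in which both distilled DeepSeek variants resolve to a shared `deepseek` entry. The function name below is taken from the import in the diff, but the body is only a sketch inferred from those expectations; the prefix matching and the fallback for unknown model types are assumptions, not the SDK's actual implementation.

```python
# Sketch only: inferred from the test expectations above, not copied from the
# SDK. The real _get_trainining_recipe_gpu_model_name_and_script may differ;
# the prefix matching and the unknown-type fallback here are assumptions.
def _get_trainining_recipe_gpu_model_name_and_script(model_type: str, script: str):
    """Return (model_base_name, script) for a GPU training recipe model_type."""
    recipe_families = {
        "llama_v3": ("llama_v3", "llama_pretrain.py"),
        "mistral": ("mistral", "mistral_pretrain.py"),
        # Both distilled DeepSeek recipes in the test (deepseek_llamav3 and
        # deepseek_qwenv2) collapse onto a single "deepseek" base name and
        # pretrain script.
        "deepseek": ("deepseek", "deepseek_pretrain.py"),
    }
    for prefix, (base_name, pretrain_script) in recipe_families.items():
        if model_type.startswith(prefix):
            return base_name, pretrain_script
    # Unknown model types fall back to the caller-supplied values (assumption).
    return model_type, script
```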
A further review comment on the new test:
- Reviewer: Can we remove this redundancy and hardcode it somewhere?
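
The comment does not say which redundancy is meant. One plausible reading is that each test case both feeds the expected script into the helper and asserts it back, so a helper that simply echoed its input would still pass. Below is a hedged sketch of that alternative test shape; the test name, the placeholder argument, and the assumption that the helper derives the script from `model_type` for known families are all hypothetical and not part of the PR.

```python
import pytest

# Hypothetical alternative (not part of the PR): hardcode the expected script
# per case and feed the helper a throwaway value, so the test fails if the
# helper merely echoes its input. Assumes the helper resolves the script from
# model_type for known recipe families; reuses the import shown in the diff.
@pytest.mark.parametrize(
    "model_type, expected_base_name, expected_script",
    [
        ("llama_v3", "llama_v3", "llama_pretrain.py"),
        ("mistral", "mistral", "mistral_pretrain.py"),
        ("deepseek_llamav3", "deepseek", "deepseek_pretrain.py"),
        ("deepseek_qwenv2", "deepseek", "deepseek_pretrain.py"),
    ],
)
def test_recipe_gpu_model_name_and_script_hardcoded(
    model_type, expected_base_name, expected_script
):
    base_name, script = _get_trainining_recipe_gpu_model_name_and_script(
        model_type, "placeholder.py"
    )
    assert base_name == expected_base_name
    assert script == expected_script
```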