diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 17762e72b..78efdc1b2 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -494,6 +494,8 @@ type Config struct { UserCommand string `json:"user_command"` CreateVersion bool `json:"create_version"` Volumes []Volumes `json:"volumes"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` } type Parameter struct { @@ -509,7 +511,7 @@ type DataSource struct { } type Volumes struct { - Nfs Nfs `json:"nfs"` + Nfs Nfs `json:"nfs"` HostPath HostPath `json:"host_path"` } @@ -526,6 +528,10 @@ type HostPath struct { ReadOnly bool `json:"read_only"` } +type Flavor struct { + Code string `json:"code"` +} + type CreateTrainJobResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` @@ -539,6 +545,28 @@ type CreateTrainJobResult struct { VersionName string `json:"version_name"` } +type GetResourceSpecsResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool `json:"is_success"` + SpecTotalCount int `json:"spec_total_count"` + Specs []Specs `json:"specs"` +} + +type Specs struct { + ErrorCode string `json:"core"` + ErrorMsg string `json:"cpu"` + IsSuccess bool `json:"no_resource"` + JobName string `json:"gpu_type"` + JobID int64 `json:"spec_id"` + Status int `json:"gpu_num"` + ResourceID string `json:"spec_code"` + VersionName string `json:"storage"` + MaxNum int `json:"max_num"` + UnitNum int `json:"unit_num"` + InterfaceType int `json:"interface_type"` +} + func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { sess := x.NewSession() defer sess.Close() diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index ef0041e8a..951065059 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -21,7 +21,9 @@ type CreateModelArtsTrainJobForm struct { BootFile string `form:"boot_file" binding:"Required"` WorkServerNumber int `form:"work_server_number" binding:"Required"` EngineID int `form:"engine_id" binding:"Required"` + SpecID int `form:"spec_id" binding:"Required"` Flavor string `form:"flavor" binding:"Required"` + PoolID string `form:"pool_id" binding:"Required"` Description string `form:"description"` } diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 053502353..8793f9b77 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -23,17 +23,17 @@ const ( FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" //train-job - ResourcePools = "{\"resource_pool\":[{\"id\":1, \"value\":\"专属资源池\"}]}" + ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + "]}" - FlavorInfos = "{\"flavor\":[{\"id\":1,\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + - "{\"id\":2,\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + - "{\"id\":3,\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + - "{\"id\":4,\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + + FlavorInfos = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + + "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + + "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + + "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + "]}" CodePath = "/code/" OutputPath = "/output/" @@ -48,6 +48,9 @@ type GenerateTrainJobReq struct { BootFile string DataUrl string TrainUrl string + FlavorCode string + PoolID string + SpecID int64 WorkServerNumber int EngineID int64 } @@ -61,8 +64,8 @@ type VersionInfo struct { type Flavor struct { Info []struct { - ID int `json:"id"` - Value string `json:"value"` + Code string `json:"code"` + Value string `json:"value"` } `json:"flavor"` } @@ -75,8 +78,8 @@ type Engine struct { type ResourcePool struct { Info []struct { - ID int `json:"id"` - Value string `json:"value"` + ID string `json:"id"` + Value string `json:"value"` } `json:"resource_pool"` } @@ -130,10 +133,15 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { Config: models.Config{ WorkServerNum: req.WorkServerNumber, AppUrl: req.CodeObsPath, - BootFileUrl: req.CodeObsPath + req.BootFile, + BootFileUrl: req.BootFile, DataUrl: req.DataUrl, EngineID: req.EngineID, TrainUrl: req.TrainUrl, + PoolID: req.PoolID, + SpecID: req.SpecID, + Flavor: models.Flavor{ + Code: req.FlavorCode, + }, }, }) diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go index b698c1d65..10267c549 100755 --- a/modules/modelarts/resty.go +++ b/modules/modelarts/resty.go @@ -24,6 +24,8 @@ const ( urlGetToken = "/v3/auth/tokens" urlNotebook = "/demanager/instances" urlTrainJob = "/training-jobs" + urlResourceSpecs = "/job/resource-specs" + errorCodeExceedLimit = "ModelArts.0118" ) func getRestyClient() *resty.Client { @@ -293,6 +295,8 @@ func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.Create client := getRestyClient() var result models.CreateTrainJobResult + log.Info("%+v",createJobParams) + retry := 0 sendjob: @@ -307,6 +311,8 @@ sendjob: return nil, fmt.Errorf("resty create train-job: %s", err) } + log.Info("", res.StatusCode(), res.Request.Body) + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { retry++ _ = getToken() @@ -320,3 +326,40 @@ sendjob: return &result, nil } + +func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetResourceSpecsResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) + + if err != nil { + return nil, fmt.Errorf("resty GetJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + log.Error("GetResourceSpecs failed(%d)", res.StatusCode()) + return &result, fmt.Errorf("GetResourceSpecs failed(%d)", res.StatusCode()) + } + + if !result.IsSuccess { + log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index abf684d43..b58398759 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -336,6 +336,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) workServerNumber := form.WorkServerNumber engineID := form.EngineID bootFile := form.BootFile + flavorCode := form.Flavor + poolID := form.PoolID + specID := form.SpecID repo := ctx.Repo.Repository codeLocalPath := setting.JobPath + jobName + modelarts.CodePath codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath @@ -349,6 +352,12 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) } //todo: upload code (send to file_server todo this work?) + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { + log.Error("Failed to obsMkdir: %s (%v)", repo.FullName(), err) + ctx.RenderWithErr("Failed to obsMkdir", tplModelArtsTrainJobNew, &form) + return + } + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) @@ -360,14 +369,18 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) DataUrl: dataPath, Description: description, CodeObsPath: codeObsPath, - BootFile: codeObsPath + "/" + bootFile, + BootFile: codeObsPath + bootFile, TrainUrl: outputObsPath, + FlavorCode: flavorCode, + PoolID: poolID, WorkServerNumber: workServerNumber, EngineID: int64(engineID), + SpecID: int64(specID), } err := modelarts.GenerateTrainJob(ctx, req) if err != nil { + log.Error("GenerateTrainJob failed:%v", err.Error()) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } @@ -408,7 +421,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { if file.IsDir() { input := &obs.PutObjectInput{} input.Bucket = setting.Bucket - input.Key = codePath + file.Name() + "/" + input.Key = parentDir + file.Name() + "/" _, err = storage.ObsCli.PutObject(input) if err != nil { log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) @@ -422,7 +435,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { } else { input := &obs.PutFileInput{} input.Bucket = setting.Bucket - input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() + input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name() input.SourceFile = codePath + file.Name() _, err = storage.ObsCli.PutFile(input) if err != nil { @@ -434,3 +447,16 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { return nil } + +func obsMkdir(dir string) error { + input := &obs.PutObjectInput{} + input.Bucket = setting.Bucket + input.Key = dir + _, err := storage.ObsCli.PutObject(input) + if err != nil { + log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) + return err + } + + return nil +} diff --git a/templates/repo/modelarts/trainjob/new.tmpl b/templates/repo/modelarts/trainjob/new.tmpl index 37deeed3c..d1aec576d 100755 --- a/templates/repo/modelarts/trainjob/new.tmpl +++ b/templates/repo/modelarts/trainjob/new.tmpl @@ -128,7 +128,7 @@
- {{range .attachments}} {{end}} @@ -151,7 +151,7 @@
- {{range .engine_versions}} {{end}} @@ -161,13 +161,13 @@
- +
- {{range .attachments}} {{end}} @@ -184,9 +184,9 @@

{{.i18n.Tr "repo.modelarts.train_job.resource_setting"}}

- {{range .resource_pools}} - + {{end}}
@@ -211,9 +211,9 @@
- {{range .flavor_infos}} - + {{end}}
@@ -232,9 +232,9 @@
-
+
- +