From 10e261c2e8a760a3e849952c0be4061da269c000 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Wed, 25 May 2022 18:34:37 +0800 Subject: [PATCH] create --- models/cloudbrain.go | 28 +++ modules/auth/grampus.go | 22 +- modules/grampus/grampus.go | 48 ++--- modules/grampus/resty.go | 59 ++++-- routers/repo/grampus.go | 210 ++++++++----------- templates/repo/grampus/trainjob/npu/new.tmpl | 31 +-- 6 files changed, 199 insertions(+), 199 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 6a6645d6b..86af80235 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -146,6 +146,7 @@ type Cloudbrain struct { PreVersionName string //父版本名称 ComputeResource string //计算资源,例如npu EngineID int64 //引擎id + ImageID string //grampus image_id TrainUrl string //输出模型的obs路径 BranchName string //分支名称 @@ -1180,6 +1181,33 @@ type GrampusJobInfo struct { Tasks []GrampusTasks `json:"tasks"` } +type GrampusSpec struct { + CreatedAt int64 `json:"createdAt"` + UpdatedAt int64 `json:"updatedAt"` + ID string `json:"id"` + Name string `json:"name"` + ProcessorType string `json:"processorType"` +} + +type GetGrampusResourceSpecsResult struct { + GrampusResult + Infos []GrampusSpec `json:"resourceSpecs"` +} + +type GrampusImage struct { + CreatedAt int64 `json:"createdAt"` + UpdatedAt int64 `json:"updatedAt"` + ID string `json:"id"` + Name string `json:"name"` + ProcessorType string `json:"processorType"` +} + +type GetGrampusImagesResult struct { + GrampusResult + TotalSize int `json:"totalSize"` + Infos []GrampusImage `json:"images"` +} + type CreateGrampusJobResponse struct { GrampusResult JobInfo GrampusJobInfo `json:"otJob"` diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go index 2cfaf7006..b92d8d06d 100755 --- a/modules/auth/grampus.go +++ b/modules/auth/grampus.go @@ -6,16 +6,18 @@ import ( ) type CreateGrampusTrainJobForm struct { - DisplayJobName string `form:"display_job_name" binding:"Required"` - JobName string `form:"job_name" binding:"Required"` - Attachment string `form:"attachment" binding:"Required"` - BootFile string `form:"boot_file" binding:"Required"` - Flavor string `form:"flavor" binding:"Required"` - Params string `form:"run_para_list" binding:"Required"` - Description string `form:"description"` - BranchName string `form:"branch_name" binding:"Required"` - FlavorName string `form:"flaver_names" binding:"Required"` - EngineName string `form:"engine_names" binding:"Required"` + DisplayJobName string `form:"display_job_name" binding:"Required"` + JobName string `form:"job_name" binding:"Required"` + Attachment string `form:"attachment" binding:"Required"` + BootFile string `form:"boot_file" binding:"Required"` + ImageID string `form:"image_id" binding:"Required"` + FlavorID string `form:"flavor" binding:"Required"` + Params string `form:"run_para_list" binding:"Required"` + Description string `form:"description"` + BranchName string `form:"branch_name" binding:"Required"` + FlavorName string `form:"flaver_names" binding:"Required"` + EngineName string `form:"engine_names" binding:"Required"` + WorkServerNumber int `form:"work_server_number" binding:"Required"` } func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 5f580189d..13280cac3 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -42,6 +42,9 @@ const ( SortByCreateTime = "create_time" ConfigTypeCustom = "custom" TotalVersionCount = 1 + + ProcessorTypeNPU = "npu.huawei.com/NPU" + ProcessorTypeGPU = "nvidia.com/gpu" ) var ( @@ -54,7 +57,7 @@ type GenerateTrainJobReq struct { JobName string Command string ResourceSpecId string - ImageUrl string + ImageUrl string //与image_id二选一,都有的情况下优先image_url ImageId string DisplayJobName string @@ -102,29 +105,26 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error jobID := jobResult.JobInfo.JobID err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.JobInfo.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobID, - JobName: req.JobName, - DisplayJobName: req.DisplayJobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainGrampus, - //VersionID: jobResult.VersionID, - //VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: req.DatasetName, - CommitID: req.CommitID, - IsLatestVersion: req.IsLatestVersion, - ComputeResource: req.ComputeResource, - //EngineID: req.EngineID, - TrainUrl: req.TrainUrl, - BranchName: req.BranchName, - Parameters: req.Params, - BootFile: req.BootFile, - DataUrl: req.DataUrl, - //LogUrl: req.LogUrl, - //FlavorCode: req.FlavorCode, + Status: TransTrainJobStatus(jobResult.JobInfo.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobID, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainGrampus, + Uuid: req.Uuid, + DatasetName: req.DatasetName, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: req.ComputeResource, + ImageID: req.ImageId, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + FlavorCode: req.ResourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 0d87f390d..183afb853 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -22,7 +22,8 @@ const ( urlGetToken = urlOpenApiV1 + "token" urlTrainJob = urlOpenApiV1 + "trainjob" - urlResourceSpecs = "/job/resource-specs" + urlGetResourceSpecs = urlOpenApiV1 + "resourcespec" + urlGetImages = urlOpenApiV1 + "image" urlTrainJobConfig = "/training-job-configs" errorCodeExceedLimit = "ModelArts.0118" @@ -155,43 +156,65 @@ sendjob: return &result, nil } -func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { +func GetResourceSpecs(processorType string) (*models.GetGrampusResourceSpecsResult, error) { checkSetting() client := getRestyClient() - var result models.GetResourceSpecsResult + var result models.GetGrampusResourceSpecsResult retry := 0 sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). + _, err := client.R(). SetAuthToken(TOKEN). SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) + Get(HOST + urlGetResourceSpecs + "?processorType=" + processorType) if err != nil { return nil, fmt.Errorf("resty GetResourceSpecs: %v", err) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + if result.ErrorCode == errorIllegalToken && retry < 1 { retry++ + log.Info("retry get token") _ = getToken() goto sendjob } - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + if result.ErrorCode != 0 { + log.Error("GetResourceSpecs failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%d): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetImages(processorType string) (*models.GetGrampusImagesResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetGrampusImagesResult + + retry := 0 + +sendjob: + _, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + urlGetImages + "?processorType=" + processorType) + + if err != nil { + return nil, fmt.Errorf("resty GetImages: %v", err) } - if !result.IsSuccess { - log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) - return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + if result.ErrorCode == errorIllegalToken && retry < 1 { + retry++ + log.Info("retry get token") + _ = getToken() + goto sendjob + } + + if result.ErrorCode != 0 { + log.Error("GetImages failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetImages failed(%d): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index bf07cb079..3f4b1361c 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -135,18 +135,13 @@ func grampusGpuNewDataPrepare(ctx *context.Context) error { json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) } ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec - ctx.Data["params"] = "" - ctx.Data["branchName"] = ctx.Repo.BranchName - - ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath - ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled - - ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath - ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled - ctx.Data["cloudbraintype"] = models.TypeCloudBrainOne - - ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode") + branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) + if err != nil { + log.Error("GetBranches error:", err) + } + ctx.Data["branches"] = branches + ctx.Data["branchName"] = ctx.Repo.BranchName return nil } @@ -170,51 +165,37 @@ func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error { //get valid dataset attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID) if err != nil { - ctx.ServerError("GetAllUserAttachments failed:", err) - return err - } - ctx.Data["attachments"] = attachs - - //get valid resource specs - var resourcePools modelarts.ResourcePool - if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + log.Error("GetModelArtsTrainAttachments failed:", err.Error()) + } else { + ctx.Data["attachments"] = attachs } - ctx.Data["resource_pools"] = resourcePools.Info - var engines modelarts.Engine - if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + //get valid engines + images, err := grampus.GetImages(grampus.ProcessorTypeNPU) + if err != nil { + log.Error("GetResourceSpecs failed:", err.Error()) + } else { + ctx.Data["engine_versions"] = images.Infos } - ctx.Data["engines"] = engines.Info - var versionInfos modelarts.VersionInfo - if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + //get valid resource specs + specs, err := grampus.GetResourceSpecs(grampus.ProcessorTypeNPU) + if err != nil { + log.Error("GetResourceSpecs failed:", err.Error()) + } else { + ctx.Data["flavor_infos"] = specs.Infos } - ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + //get branches + branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) + if err != nil { + log.Error("GetBranches error:", err.Error()) + } else { + ctx.Data["branches"] = branches } - ctx.Data["flavor_infos"] = flavorInfos.Info - ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName - configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) - if err != nil { - ctx.ServerError("getConfigList failed:", err) - return err - } - ctx.Data["config_list"] = configList.ParaConfigs - ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo - return nil } @@ -246,10 +227,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branchName := form.BranchName isLatestVersion := modelarts.IsLatestVersion - FlavorName := form.FlavorName - VersionCount := modelarts.VersionCount - EngineName := form.EngineName + flavorName := form.FlavorName + versionCount := modelarts.VersionCount + engineName := form.EngineName + //check count limit count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource) if err != nil { log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -265,12 +247,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } } + //check param if err := grampusParamCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) grampusTrainJobNpuNewDataPrepare(ctx) ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) return } + //check whether the task name in the project is duplicated tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) if err == nil { @@ -295,9 +279,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain os.RemoveAll(codeLocalPath) } - gitRepo, _ := git.OpenRepository(repo.RepoPath()) - commitID, _ := gitRepo.GetBranchCommitID(branchName) - if err := downloadCode(repo, codeLocalPath, branchName); err != nil { log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) grampusTrainJobNpuNewDataPrepare(ctx) @@ -321,7 +302,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { - // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) grampusTrainJobNpuNewDataPrepare(ctx) ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form) @@ -330,9 +310,9 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain //prepare command //todo: download code, download dataset, unzip dataset, exec code, upload model + command, err := generateCommand(grampus.ProcessorTypeNPU, codeObsPath, dataPath, params, "") var parameters models.Parameters param := make([]models.Parameter, 0) - existDeviceTarget := false if len(params) != 0 { err := json.Unmarshal([]byte(params), ¶meters) if err != nil { @@ -343,63 +323,45 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } for _, parameter := range parameters.Parameter { - if parameter.Label == modelarts.DeviceTarget { - existDeviceTarget = true - } - if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { - param = append(param, models.Parameter{ - Label: parameter.Label, - Value: parameter.Value, - }) - } + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) } } - if !existDeviceTarget { - param = append(param, models.Parameter{ - Label: modelarts.DeviceTarget, - Value: modelarts.Ascend, - }) - } + param = append(param, models.Parameter{ + Label: modelarts.DeviceTarget, + Value: modelarts.Ascend, + }) + + gitRepo, _ := git.OpenRepository(repo.RepoPath()) + commitID, _ := gitRepo.GetBranchCommitID(branchName) req := &grampus.GenerateTrainJobReq{ - JobName: jobName, - DisplayJobName: displayJobName, - ComputeResource: models.NPUResource, - Command: "echo test", - ResourceSpecId: "f2497d54732b45fb8d887e63be1db4a7", - ImageUrl: "", - ImageId: "e6e85cd78ca24e158f71b6fac9c2fb95", - - DataUrl: dataPath, - Description: description, - CodeObsPath: codeObsPath, - BootFileUrl: codeObsPath + bootFile, - BootFile: bootFile, - //TrainUrl: outputObsPath, - //FlavorCode: flavorCode, - WorkServerNumber: 1, - //EngineID: int64(engineID), - //LogUrl: logObsPath, - //PoolID: poolID, - Uuid: uuid, - //Parameters: param, + JobName: jobName, + DisplayJobName: displayJobName, + ComputeResource: models.NPUResource, + Command: command, + ResourceSpecId: form.FlavorID, + ImageUrl: "", + ImageId: form.ImageID, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + BootFile: bootFile, + WorkServerNumber: form.WorkServerNumber, + Uuid: uuid, CommitID: commitID, IsLatestVersion: isLatestVersion, BranchName: branchName, Params: form.Params, - FlavorName: FlavorName, - EngineName: EngineName, - VersionCount: VersionCount, + FlavorName: flavorName, + EngineName: engineName, + VersionCount: versionCount, TotalVersionCount: modelarts.TotalVersionCount, } - //将params转换Parameters.Parameter,出错时返回给前端 - var Parameters modelarts.Parameters - if err := json.Unmarshal([]byte(params), &Parameters); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return - } - err = grampus.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) @@ -517,25 +479,6 @@ func GrampusTrainJobShow(ctx *context.Context) { task.DatasetName = attachment.Name } - if len(task.Parameters) > 0 { - var parameters models.Parameters - err := json.Unmarshal([]byte(task.Parameters), ¶meters) - if err != nil { - log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err) - ctx.ServerError("system error", err) - return - } - - if len(parameters.Parameter) > 0 { - paramTemp := "" - for _, Parameter := range parameters.Parameter { - param := Parameter.Label + " = " + Parameter.Value + "; " - paramTemp = paramTemp + param - } - task.Parameters = paramTemp[:len(paramTemp)-2] - } - } - if task.DeletedAt.IsZero() { //normal record result, err := grampus.GetJob(task.JobID) if err != nil { @@ -565,6 +508,25 @@ func GrampusTrainJobShow(ctx *context.Context) { } } + if len(task.Parameters) > 0 { + var parameters models.Parameters + err := json.Unmarshal([]byte(task.Parameters), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err) + ctx.ServerError("system error", err) + return + } + + if len(parameters.Parameter) > 0 { + paramTemp := "" + for _, Parameter := range parameters.Parameter { + param := Parameter.Label + " = " + Parameter.Value + "; " + paramTemp = paramTemp + param + } + task.Parameters = paramTemp[:len(paramTemp)-2] + } + } + taskList := make([]*models.Cloudbrain, 0) taskList = append(taskList, task) ctx.Data["version_list_task"] = taskList @@ -595,3 +557,13 @@ func GrampusGetLog(ctx *context.Context) { return } + +func generateCommand(processorType, codePath, dataPath, params, outputPath string) (string, error) { + var command string + //download code + //download dataset + //unzip dataset + //exec code + //upload models + return command, nil +} diff --git a/templates/repo/grampus/trainjob/npu/new.tmpl b/templates/repo/grampus/trainjob/npu/new.tmpl index 6f5f5455f..9e5ba39bc 100755 --- a/templates/repo/grampus/trainjob/npu/new.tmpl +++ b/templates/repo/grampus/trainjob/npu/new.tmpl @@ -136,18 +136,10 @@
-
- -
-
- {{range .engine_versions}} - + {{end}} @@ -175,22 +167,6 @@ {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}
- {{if ne 0 (len .params)}} - {{range $k ,$v := .params}} -
-
- -
-
- -
- - - - -
- {{end}} - {{end}}
@@ -224,7 +200,7 @@
@@ -237,7 +213,6 @@