From 99b2c851255c8bad881300517a8feeb59cf072f1 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Tue, 31 May 2022 18:02:30 +0800 Subject: [PATCH] view --- models/cloudbrain.go | 14 +++++--- modules/grampus/grampus.go | 3 +- modules/grampus/resty.go | 2 +- options/locale/locale_en-US.ini | 5 +++ options/locale/locale_zh-CN.ini | 5 +++ routers/api/v1/api.go | 2 +- routers/repo/cloudbrain.go | 3 ++ routers/repo/grampus.go | 38 ++++++++++---------- routers/routes/routes.go | 2 +- templates/repo/cloudbrain/trainjob/new.tmpl | 13 +++++++ templates/repo/grampus/trainjob/gpu/new.tmpl | 13 +++++++ templates/repo/grampus/trainjob/npu/new.tmpl | 13 +++++++ templates/repo/grampus/trainjob/show.tmpl | 13 ++++++- templates/repo/modelarts/trainjob/new.tmpl | 13 +++++++ 14 files changed, 110 insertions(+), 29 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 06fbea5b3..97fa69e0d 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -147,6 +147,7 @@ type Cloudbrain struct { ComputeResource string //计算资源,例如npu EngineID int64 //引擎id ImageID string //grampus image_id + AiCenter string //grampus ai center: center_id+center_name TrainUrl string //输出模型的obs路径 BranchName string //分支名称 @@ -1224,11 +1225,14 @@ type GrampusStopJobResponse struct { } type GrampusTasks struct { - Command string `json:"command"` - Name string `json:"name"` - ImageId string `json:"imageId"` - ResourceSpecId string `json:"resourceSpecId"` - ImageUrl string `json:"imageUrl"` + Command string `json:"command"` + Name string `json:"name"` + ImageId string `json:"imageId"` + ResourceSpecId string `json:"resourceSpecId"` + ImageUrl string `json:"imageUrl"` + CenterID []string `json:"centerID"` + CenterName []string `json:"centerName"` + ReplicaNum int `json:"replicaNum"` } type CreateGrampusJobRequest struct { diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 95985e533..a0f398115 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -40,7 +40,7 @@ const ( ProcessorTypeNPU = "npu.huawei.com/NPU" ProcessorTypeGPU = "nvidia.com/gpu" - CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;" + CommandPrepareScript = "cd /tmp;mkdir -p output;mkdir -p code;mkdir -p dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;" CodeArchiveName = "master.zip" ) @@ -92,6 +92,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error ResourceSpecId: req.ResourceSpecId, ImageId: req.ImageId, ImageUrl: req.ImageUrl, + ReplicaNum: 0, }, }, }) diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 183afb853..bd64ace8f 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -228,7 +228,7 @@ func GetTrainJobLog(jobID string) (string, error) { res, err := client.R(). SetAuthToken(TOKEN). SetResult(&logContent). - Get(HOST + urlTrainJob + "/" + jobID + "/log") + Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log") if err != nil { return logContent, fmt.Errorf("resty GetTrainJobLog: %v", err) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index c52a369ce..3166dafb7 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1170,6 +1170,8 @@ model.manage.sava_model = Sava Model model.manage.model_manage = ModelManage model.manage.model_accuracy = Model Accuracy +grampus.train_job.ai_center = AI Center + template.items = Template Items template.git_content = Git Content (Default Branch) template.git_hooks = Git Hooks @@ -3013,6 +3015,9 @@ Platform_Tutorial = Tutorial foot.advice_feedback = Feedback [cloudbrain] +resource_cluster = Resource Cluster +resource_cluster_openi = OpenI Resource Cluster +resource_cluster_c2net = China Computing NET compute_resource = Computing resources task_name = Task name task_type = Task type diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index cb1c7565a..e9b6a5280 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -1180,6 +1180,8 @@ model.manage.sava_model = 保存模型 model.manage.model_manage = 模型管理 model.manage.model_accuracy = 模型精度 +grampus.train_job.ai_center=ai计算中心 + template.items=模板选项 template.git_content=Git数据(默认分支) template.git_hooks=Git 钩子 @@ -3023,6 +3025,9 @@ Platform_Tutorial=新手指引 foot.advice_feedback = 意见反馈 [cloudbrain] +resource_cluster = 算力集群 +resource_cluster_openi = 启智集群 +resource_cluster_c2net = 智算集群 compute_resource = 计算资源 task_name = 任务名称 task_type = 任务类型 diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index 471f8be7e..f6153e811 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -935,9 +935,9 @@ func RegisterRoutes(m *macaron.Macaron) { }) }, reqRepoReader(models.UnitTypeCloudBrain)) m.Group("/grampus", func() { - m.Get("/:id", repo.GetCloudbrainTask) m.Group("/train-job", func() { m.Group("/:jobid", func() { + m.Get("", repo.GetModelArtsTrainJobVersion) m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob) m.Get("/log", repo_ext.GrampusGetLog) }) diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 69cd24901..525dd07bb 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -1477,6 +1477,9 @@ func SyncCloudbrainStatus() { } if result != nil { + if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { + task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] + } task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) task.Duration = result.JobInfo.RunSec task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 542b1c386..d25a75c68 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -9,7 +9,6 @@ import ( "code.gitea.io/gitea/modules/util" "encoding/json" "errors" - "fmt" "github.com/unknwon/com" "io/ioutil" "net/http" @@ -458,11 +457,14 @@ func GrampusTrainJobShow(ctx *context.Context) { result, err := grampus.GetJob(task.JobID) if err != nil { log.Error("GetJob failed:" + err.Error()) - ctx.ServerError("GetJob failed", err) - return + //ctx.ServerError("GetJob failed", err) + //return } if result != nil { + if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { + task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] + } task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning { task.Duration = result.JobInfo.RunSec @@ -508,6 +510,13 @@ func GrampusTrainJobShow(ctx *context.Context) { taskList = append(taskList, task) ctx.Data["version_list_task"] = taskList + ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task) + + aiCenterInfo := strings.Split(task.AiCenter, "+") + if len(aiCenterInfo) == 2 { + ctx.Data["ai_center"] = aiCenterInfo[1] + } + ctx.HTML(http.StatusOK, tplGrampusTrainJobShow) } @@ -553,7 +562,6 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile toolUnzip = "tar -zxvf " } commandUnzip := "cd /tmp/dataset;" + toolUnzip + datasetName + ";cd /tmp/code;unzip -q master.zip;" - commandUnzip += "cd /tmp/dataset/" + strings.TrimSuffix(datasetName, ".zip") + ";ls;" command += commandUnzip //exec code @@ -579,25 +587,17 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile commandCode := "cd /tmp/code/" + repoName + ";python " + bootFile + paramCode + ";" command += commandCode + //get exec result + commandGetRes := "result=$?;" + command += commandGetRes + //upload models commandUpload := "cd /tmp/script_for_grampus/;./uploader_for_grampus " + setting.Bucket + " " + outputObsPath + " " + "/tmp/output/;" command += commandUpload - return command, nil -} - -func generateCommandObsDownloadFile(srcObsFile, dstLocalDir string) (string, error) { - var command string - - command = "python;" - command += "from modelarts.session import Session \n" - command += fmt.Sprintf("session = Session(access_key='%s',secret_key='%s', project_id='%s', region_name='%s') \n", setting.AccessKeyID, setting.SecretAccessKey, setting.ProjectID, setting.Location) - - if util.IsDir(srcObsFile) { - command += fmt.Sprintf("session.obs.download_dir(src_obs_dir=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir) - } else { - command += fmt.Sprintf("session.obs.download_file(src_obs_file=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir) - } + //check exec result + commandCheckRes := " [[ result -eq 0 ]] && echo success || ls failed;" + command += commandCheckRes return command, nil } diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 64566a1d7..ab9f12205 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1089,7 +1089,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow) m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusTrainJobDel) - m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload) }) m.Group("/gpu", func() { m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobGPUNew) diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl index 39315cfad..4eff7c21c 100755 --- a/templates/repo/cloudbrain/trainjob/new.tmpl +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -82,6 +82,19 @@