From 7fd4bef198c65fc95759ff6fe0a455ed2485adfa Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 18 Jul 2022 20:25:12 +0800 Subject: [PATCH] debug --- models/cloudbrain.go | 7 +- modules/modelarts/modelarts.go | 118 +++++++++++++++++++++++++------ modules/modelarts/resty.go | 3 +- routers/api/v1/repo/modelarts.go | 17 +---- routers/repo/cloudbrain.go | 34 +++------ routers/repo/modelarts.go | 29 +++----- 6 files changed, 121 insertions(+), 87 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index bc8eaa987..b3320c49d 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -1275,7 +1275,7 @@ type JobVersionList struct { type GetTrainJobVersionListResult struct { ErrorResult - JobID string `json:"job_id"` + JobID int64 `json:"job_id"` JobName string `json:"job_name"` JobDesc string `json:"job_desc"` VersionCount int64 `json:"version_count"` @@ -2169,3 +2169,8 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) { In("id", ids). Find(&cloudbrains) } + +func GetCloudbrainCountByJobName(jobName, jobType string) (int, error) { + count, err := x.Where("job_name = ? and job_type= ?", jobName, jobType).Count(new(Cloudbrain)) + return int(count), err +} diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 0c501861f..ab5070262 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -509,7 +509,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job RepoID: repo.ID, Type: models.TypeCloudBrainTwo, JobTypes: jobTypes, - JobName: req.JobName, + JobID: jobId, }) if err != nil { ctx.ServerError("Cloudbrain", err) @@ -519,10 +519,10 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job createTime := timeutil.TimeStampNow() task := &models.Cloudbrain{ - Status: string(models.ModelArtsTrainJobWaiting), + Status: models.JobStatusTemp, UserID: ctx.User.ID, RepoID: ctx.Repo.Repository.ID, - JobID: models.TempJobIdPrefix + jobId, + JobID: jobId, JobName: req.JobName, DisplayJobName: req.DisplayJobName, JobType: string(models.JobTypeTrain), @@ -846,17 +846,17 @@ func GetNotebookImageName(imageId string) (string, error) { } func HandleTrainJobInfo(task *models.Cloudbrain) error { - if strings.HasPrefix(task.JobID, models.TempJobIdPrefix) { + if isTempJob(task.JobID, task.Status) { if task.VersionCount > VersionCountOne { //multi version result, err := GetTrainJobVersionList(1000, 1, strings.TrimPrefix(task.JobID, models.TempJobIdPrefix)) if err != nil { - log.Error("GetTrainJobVersionList(%s) failed:%v", task.JobName, err) + log.Error("GetTrainJobVersionList failed:%v", err) return err } if result != nil { - if result.JobID == task.JobID && result.JobName == task.JobName { + if strconv.FormatInt(result.JobID, 10) == task.JobID && result.JobName == task.JobName { if result.VersionCount == int64(task.VersionCount) { log.Info("find the record(%s)", task.DisplayJobName) task.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus) @@ -871,13 +871,13 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error { temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID) if err != nil { log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error()) - return err - } - err = models.DeleteCloudbrainTemp(temp) - if err != nil { - log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err) - return err + } else { + err = models.DeleteCloudbrainTemp(temp) + if err != nil { + log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err) + } } + return nil } else { log.Error("can not find the record(%s) until now", task.DisplayJobName) @@ -890,16 +890,14 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error { //inference or one version result, err := GetTrainJobList(1000, 1, "create_time", "desc", task.JobName) if err != nil { - log.Error("GetTrainJobList(%s) failed:%v", task.DisplayJobName, err) + log.Error("GetTrainJobList failed:%v", err) return err } if result != nil { - isExist := false for _, job := range result.JobList { if task.JobName == job.JobName { log.Info("find the record(%s)", task.DisplayJobName) - isExist = true task.Status = TransTrainJobStatus(job.IntStatus) task.JobID = strconv.FormatInt(job.JobID, 10) @@ -921,14 +919,6 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error { return nil } } - - //todo: move - if !isExist { - log.Error("can not find the record(%s) until now", task.DisplayJobName) - //temp.QueryTimes = temp.QueryTimes + 1 - } else { - log.Info("find the record(%s)", task.DisplayJobName) - } } } @@ -963,3 +953,85 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error { return nil } + +func HandleNotebookInfo(task *models.Cloudbrain) error { + if isTempJob(task.JobID, task.Status) { + result, err := GetNotebookList(1000, 0, "createTime", "DESC", task.JobName) + if err != nil { + log.Error("GetNotebookList failed:%v", err) + return err + } + + if result != nil { + count, err := models.GetCloudbrainCountByJobName(task.JobName, task.JobType) + if err != nil { + log.Error("GetCloudbrainCountByJobName failed:%v", err) + return err + } + + if len(result.NotebookList) == count { + if result.NotebookList[0].JobName == task.JobName { + log.Info("find the record(%s)", task.DisplayJobName) + task.Status = result.NotebookList[0].Status + task.JobID = result.NotebookList[0].JobID + + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + return err + } + temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID) + if err != nil { + log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error()) + return err + } + err = models.DeleteCloudbrainTemp(temp) + if err != nil { + log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err) + return err + } + return nil + } else { + log.Error("can not find the record(%s) until now", task.DisplayJobName) + } + } else { + log.Error("can not find the record(%s) until now", task.DisplayJobName) + } + } else { + log.Error("can not find the record(%s) until now", task.DisplayJobName) + } + } else { + //normal + result, err := GetNotebook2(task.JobID) + if err != nil { + log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err) + return err + } + + if result != nil { + task.Status = result.Status + if task.StartTime == 0 && result.Lease.UpdateTime > 0 { + task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) + } + if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { + task.EndTime = timeutil.TimeStampNow() + } + task.CorrectCreateUnix() + task.ComputeAndSetDuration() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) + return err + } + } + } + + return nil +} + +func isTempJob(jobID, status string) bool { + if (strings.HasPrefix(jobID, models.TempJobIdPrefix) && status == string(models.ModelArtsTrainJobWaiting)) || status == models.JobStatusTemp { + return true + } + return false +} diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go index dd364ea44..babcff1aa 100755 --- a/modules/modelarts/resty.go +++ b/modules/modelarts/resty.go @@ -1379,7 +1379,7 @@ sendjob: return &result, nil } -func GetNotebookList(limit, page int, sortBy, order, searchContent, status string) (*models.GetNotebookListResult, error) { +func GetNotebookList(limit, offset int, sortBy, order, searchContent string) (*models.GetNotebookListResult, error) { checkSetting() client := getRestyClient() var result models.GetNotebookListResult @@ -1390,6 +1390,7 @@ sendjob: res, err := client.R(). SetQueryParams(map[string]string{ "limit": strconv.Itoa(limit), + "offset": strconv.Itoa(offset), "name": searchContent, "sort_key": sortBy, "sort_dir": order, diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 81f9bc03e..7f2b30d81 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -36,29 +36,16 @@ func GetModelArtsNotebook2(ctx *context.APIContext) { ctx.NotFound(err) return } - result, err := modelarts.GetNotebook2(job.JobID) + err = modelarts.HandleNotebookInfo(job) if err != nil { ctx.NotFound(err) return } - if job.StartTime == 0 && result.Lease.UpdateTime > 0 { - job.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) - } - job.Status = result.Status - if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) { - job.EndTime = timeutil.TimeStampNow() - } - job.CorrectCreateUnix() - job.ComputeAndSetDuration() - err = models.UpdateJob(job) - if err != nil { - log.Error("UpdateJob failed:", err) - } ctx.JSON(http.StatusOK, map[string]interface{}{ "ID": ID, "JobName": job.JobName, - "JobStatus": result.Status, + "JobStatus": job.Status, }) } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 6bd1c7009..c4ed6385a 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -368,7 +368,6 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { } } - func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBrainInferencForm) { ctx.Data["PageIsCloudBrain"] = true displayJobName := form.DisplayJobName @@ -489,6 +488,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job") } + /** 检查用户传输的参数是否符合专属资源池 */ @@ -1706,42 +1706,24 @@ func SyncCloudbrainStatus() { } } else if task.Type == models.TypeCloudBrainTwo { if task.JobType == string(models.JobTypeDebug) { - //result, err := modelarts.GetJob(task.JobID) - result, err := modelarts.GetNotebook2(task.JobID) + err := modelarts.HandleNotebookInfo(task) if err != nil { - log.Error("GetJob(%s) failed:%v", task.JobName, err) + log.Error("HandleNotebookInfo(%s) failed:%v", task.DisplayJobName, err) continue } - - if result != nil { - task.Status = result.Status - if task.StartTime == 0 && result.Lease.UpdateTime > 0 { - task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) - } - if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { - task.EndTime = timeutil.TimeStampNow() - } - task.CorrectCreateUnix() - task.ComputeAndSetDuration() - err = models.UpdateJob(task) - if err != nil { - log.Error("UpdateJob(%s) failed:%v", task.JobName, err) - continue - } - } } else if task.JobType == string(models.JobTypeTrain) || task.JobType == string(models.JobTypeInference) { err := modelarts.HandleTrainJobInfo(task) if err != nil { - log.Error("HandleTrainJobInfo(%s) failed:%v", task.JobName, err) + log.Error("HandleTrainJobInfo(%s) failed:%v", task.DisplayJobName, err) continue } } else { - log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType) + log.Error("task.JobType(%s) is error:%s", task.DisplayJobName, task.JobType) } } else if task.Type == models.TypeC2Net { result, err := grampus.GetJob(task.JobID) if err != nil { - log.Error("GetTrainJob(%s) failed:%v", task.JobName, err) + log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err) continue } @@ -1762,12 +1744,12 @@ func SyncCloudbrainStatus() { task.CorrectCreateUnix() err = models.UpdateJob(task) if err != nil { - log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) continue } } } else { - log.Error("task.Type(%s) is error:%d", task.JobName, task.Type) + log.Error("task.Type(%s) is error:%d", task.DisplayJobName, task.Type) } } diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 446b32a0f..509df334c 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -263,28 +263,15 @@ func NotebookShow(ctx *context.Context) { return } - result, err := modelarts.GetNotebook2(task.JobID) - if err != nil { - ctx.Data["error"] = err.Error() - ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil) - return - } - - if result != nil { - if task.DeletedAt.IsZero() { //normal record - if task.Status != result.Status { - task.Status = result.Status - models.ParseAndSetDurationFromModelArtsNotebook(result, task) - err = models.UpdateJob(task) - if err != nil { - ctx.Data["error"] = err.Error() - ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil) - return - } - } - } else { //deleted record - + if task.DeletedAt.IsZero() { //normal record + err := modelarts.HandleNotebookInfo(task) + if err != nil { + ctx.Data["error"] = err.Error() + ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil) + return } + } else { //deleted record + } datasetDownload := make([]models.DatasetDownload, 0)