diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index bb1241247..dd3d3531c 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -31,6 +31,7 @@ const (
 	JobTypeBrainScore   JobType = "BRAINSCORE"
 	JobTypeTrain        JobType = "TRAIN"
 
+	// notebook job statuses
 	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
 	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       //创建中
 	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  //创建失败
@@ -46,6 +47,30 @@ const (
 	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        //已删除
 	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       //规格变更中
 	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  //规格变更失败
+
+	// train job statuses
+	ModelArtsTrainJobUnknown               ModelArtsJobStatus = "UNKNOWN"                 // job status unknown
+	ModelArtsTrainJobInit                  ModelArtsJobStatus = "INIT"                    // job is initializing
+	ModelArtsTrainJobImageCreating         ModelArtsJobStatus = "IMAGE_CREATING"          // job image is being created
+	ModelArtsTrainJobImageFailed           ModelArtsJobStatus = "IMAGE_FAILED"            // job image creation failed
+	ModelArtsTrainJobSubmitTrying          ModelArtsJobStatus = "SUBMIT_TRYING"           // job is being submitted
+	ModelArtsTrainJobSubmitFailed          ModelArtsJobStatus = "SUBMIT_FAILED"           // job submission failed
+	ModelArtsTrainJobDeleteFailed          ModelArtsJobStatus = "DELETE_FAILED"           // job deletion failed
+	ModelArtsTrainJobWaiting               ModelArtsJobStatus = "WAITING"                 // job is queuing
+	ModelArtsTrainJobRunning               ModelArtsJobStatus = "RUNNING"                 // job is running
+	ModelArtsTrainJobKilling               ModelArtsJobStatus = "KILLING"                 // job is being cancelled
+	ModelArtsTrainJobCompleted             ModelArtsJobStatus = "COMPLETED"               // job has completed
+	ModelArtsTrainJobFailed                ModelArtsJobStatus = "FAILED"                  // job run failed
+	ModelArtsTrainJobKilled                ModelArtsJobStatus = "KILLED"                  // job was cancelled successfully
+	ModelArtsTrainJobCanceled              ModelArtsJobStatus = "CANCELED"                // job cancelled
+	ModelArtsTrainJobLost                  ModelArtsJobStatus = "LOST"                    // job lost
+	ModelArtsTrainJobScaling               ModelArtsJobStatus = "SCALING"                 // job is scaling up
+	ModelArtsTrainJobSubmitModelFailed     ModelArtsJobStatus = "SUBMIT_MODEL_FAILED"     // model submission failed
+	ModelArtsTrainJobDeployServiceFailed   ModelArtsJobStatus = "DEPLOY_SERVICE_FAILED"   // service deployment failed
+	ModelArtsTrainJobCheckInit             ModelArtsJobStatus = "CHECK_INIT"              // review job is initializing
+	ModelArtsTrainJobCheckRunning          ModelArtsJobStatus = "CHECK_RUNNING"           // review job is running
+	ModelArtsTrainJobCheckRunningCompleted ModelArtsJobStatus = "CHECK_RUNNING_COMPLETED" // review job has completed
+	ModelArtsTrainJobCheckFailed           ModelArtsJobStatus = "CHECK_FAILED"            // review job failed
 )
 
 type Cloudbrain struct {
@@ -1091,3 +1116,22 @@ func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
 	}
 	return false
 }
+
+// GetCloudBrainUnStoppedJob returns at most 100 cloudbrain records whose
+// status is not in the exclusion list passed to NotIn below. No ordering is
+// requested, so which records are picked when more than 100 match is left to
+// the database.
+func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) {
+	cloudbrains := make([]*Cloudbrain, 0, 10)
+	// Run the query in its own statement: Go does not specify whether a
+	// variable operand of a return statement is read before or after a call
+	// in the same statement that modifies it.
+	err := x.
+		NotIn("status",
+			JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted,
+			ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed,
+			ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed).
+		Limit(100).
+		Find(&cloudbrains)
+	return cloudbrains, err
+}
diff --git a/modules/cron/tasks_basic.go b/modules/cron/tasks_basic.go
index 207018c20..294690d45 100755
--- a/modules/cron/tasks_basic.go
+++ b/modules/cron/tasks_basic.go
@@ -185,6 +185,19 @@ func registerHandleSummaryStatistic() {
 	})
 }
 
+// registerSyncCloudbrainStatus registers the "sync_cloudbrain_status" task,
+// which calls repo.SyncCloudbrainStatus on a default "@every 10m" schedule.
+func registerSyncCloudbrainStatus() {
+	RegisterTaskFatal("sync_cloudbrain_status", &BaseConfig{
+		Enabled:    true,
+		RunAtStart: false,
+		Schedule:   "@every 10m",
+	}, func(ctx context.Context, _ *models.User, _ Config) error {
+		repo.SyncCloudbrainStatus()
+		return nil
+	})
+}
+
 func initBasicTasks() {
 	registerUpdateMirrorTask()
 	registerRepoHealthCheck()
@@ -202,4 +215,6 @@ func initBasicTasks() {
 
 	registerHandleRepoAndUserStatistic()
 	registerHandleSummaryStatistic()
+
+	registerSyncCloudbrainStatus()
 }
diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go
index 28f3a0184..bf0fffc18 100755
--- a/routers/repo/cloudbrain.go
+++ b/routers/repo/cloudbrain.go
@@ -715,3 +715,102 @@ func downloadRateCode(repo *models.Repository, taskName, gitPath, codePath, benc
 
 	return nil
 }
+
+// SyncCloudbrainStatus refreshes the stored status of every task returned by
+// models.GetCloudBrainUnStoppedJob, asking the backend selected by the task's
+// Type (and, for TypeCloudBrainTwo, its JobType) for the current state.
+// Failures are logged per task and do not stop the remaining tasks.
+func SyncCloudbrainStatus() {
+	cloudBrains, err := models.GetCloudBrainUnStoppedJob()
+	if err != nil {
+		log.Error("GetCloudBrainUnStoppedJob failed:%v", err)
+		return
+	}
+
+	for _, task := range cloudBrains {
+		if task.Type == models.TypeCloudBrainOne {
+			result, err := cloudbrain.GetJob(task.JobID)
+			if err != nil {
+				log.Error("GetJob(%s) failed:%v", task.JobName, err)
+				continue
+			}
+
+			if result != nil {
+				jobRes, err := models.ConvertToJobResultPayload(result.Payload)
+				if err != nil {
+					log.Error("ConvertToJobResultPayload(%s) failed:%v", task.JobName, err)
+					continue
+				}
+
+				// TaskRoles holds untyped values, so check the entry's shape instead
+				// of letting a failed assertion panic and abort the whole sync.
+				taskRole, ok := jobRes.TaskRoles[cloudbrain.SubTaskName].(map[string]interface{})
+				if !ok {
+					log.Error("GetJob(%s): task role %v is missing or malformed", task.JobName, cloudbrain.SubTaskName)
+					continue
+				}
+
+				taskRes, err := models.ConvertToTaskPod(taskRole)
+				if err != nil {
+					log.Error("ConvertToTaskPod(%s) failed:%v", task.JobName, err)
+					continue
+				}
+				if len(taskRes.TaskStatuses) == 0 {
+					log.Error("GetJob(%s): no task status returned", task.JobName)
+					continue
+				}
+
+				task.Status = taskRes.TaskStatuses[0].State
+				// A task still reported as waiting is not written back.
+				if task.Status != string(models.JobWaiting) {
+					err = models.UpdateJob(task)
+					if err != nil {
+						log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
+						continue
+					}
+				}
+			}
+		} else if task.Type == models.TypeCloudBrainTwo {
+			if task.JobType == string(models.JobTypeDebug) {
+				result, err := modelarts.GetJob(task.JobID)
+				if err != nil {
+					log.Error("GetJob(%s) failed:%v", task.JobName, err)
+					continue
+				}
+
+				if result != nil {
+					task.Status = result.Status
+
+					err = models.UpdateJob(task)
+					if err != nil {
+						log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
+						continue
+					}
+				}
+			} else if task.JobType == string(models.JobTypeTrain) {
+				result, err := modelarts.GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
+				if err != nil {
+					log.Error("GetTrainJob(%s) failed:%v", task.JobName, err)
+					continue
+				}
+
+				if result != nil {
+					task.Status = modelarts.TransTrainJobStatus(result.IntStatus)
+					task.Duration = result.Duration
+					task.TrainJobDuration = result.TrainJobDuration
+
+					err = models.UpdateJob(task)
+					if err != nil {
+						log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
+						continue
+					}
+				}
+			} else {
+				log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType)
+			}
+
+		} else {
+			log.Error("task.Type(%s) is error:%d", task.JobName, task.Type)
+		}
+	}
+}