Browse Source

Merge pull request '同步云脑任务状态' (#1000) from sync_cb_status into V20211213

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/1000
Reviewed-by: ychao_1983 <ychao_1983@sina.com>
tags/v1.21.12.1^2
ychao_1983 4 years ago
parent
commit
a463e24741
3 changed files with 124 additions and 0 deletions
  1. +36
    -0
      models/cloudbrain.go
  2. +13
    -0
      modules/cron/tasks_basic.go
  3. +75
    -0
      routers/repo/cloudbrain.go

+ 36
- 0
models/cloudbrain.go View File

@@ -31,6 +31,7 @@ const (
JobTypeBrainScore JobType = "BRAINSCORE" JobTypeBrainScore JobType = "BRAINSCORE"
JobTypeTrain JobType = "TRAIN" JobTypeTrain JobType = "TRAIN"


//notebook
ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中 ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中 ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中
ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" //创建失败 ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" //创建失败
@@ -46,6 +47,30 @@ const (
ModelArtsDeleted ModelArtsJobStatus = "DELETED" //已删除 ModelArtsDeleted ModelArtsJobStatus = "DELETED" //已删除
ModelArtsResizing ModelArtsJobStatus = "RESIZING" //规格变更中 ModelArtsResizing ModelArtsJobStatus = "RESIZING" //规格变更中
ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" //规格变更失败 ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" //规格变更失败

//trainjob
ModelArtsTrainJobUnknown ModelArtsJobStatus = "UNKNOWN" //作业状态未知
ModelArtsTrainJobInit ModelArtsJobStatus = "INIT" //作业初始化状态
ModelArtsTrainJobImageCreating ModelArtsJobStatus = "IMAGE_CREATING" //作业镜像正在创建
ModelArtsTrainJobImageFailed ModelArtsJobStatus = "IMAGE_FAILED" //作业镜像创建失败
ModelArtsTrainJobSubmitTrying ModelArtsJobStatus = "SUBMIT_TRYING" //作业正在提交
ModelArtsTrainJobSubmitFailed ModelArtsJobStatus = "SUBMIT_FAILED" //作业提交失败
ModelArtsTrainJobDeleteFailed ModelArtsJobStatus = "DELETE_FAILED" //作业删除失败
ModelArtsTrainJobWaiting ModelArtsJobStatus = "WAITING" //作业正在排队中
ModelArtsTrainJobRunning ModelArtsJobStatus = "RUNNING" //作业正在运行中
ModelArtsTrainJobKilling ModelArtsJobStatus = "KILLING" //作业正在取消
ModelArtsTrainJobCompleted ModelArtsJobStatus = "COMPLETED" //作业已经完成
ModelArtsTrainJobFailed ModelArtsJobStatus = "FAILED" //作业运行失败
ModelArtsTrainJobKilled ModelArtsJobStatus = "KILLED" //作业取消成功
ModelArtsTrainJobCanceled ModelArtsJobStatus = "CANCELED" //作业取消
ModelArtsTrainJobLost ModelArtsJobStatus = "LOST" //作业丢失
ModelArtsTrainJobScaling ModelArtsJobStatus = "SCALING" //作业正在扩容
ModelArtsTrainJobSubmitModelFailed ModelArtsJobStatus = "SUBMIT_MODEL_FAILED" //提交模型失败
ModelArtsTrainJobDeployServiceFailed ModelArtsJobStatus = "DEPLOY_SERVICE_FAILED" //部署服务失败
ModelArtsTrainJobCheckInit ModelArtsJobStatus = "CHECK_INIT" //审核作业初始化
ModelArtsTrainJobCheckRunning ModelArtsJobStatus = "CHECK_RUNNING" //审核作业正在运行中
ModelArtsTrainJobCheckRunningCompleted ModelArtsJobStatus = "CHECK_RUNNING_COMPLETED" //审核作业已经完成
ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败
) )


type Cloudbrain struct { type Cloudbrain struct {
@@ -1091,3 +1116,14 @@ func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
} }
return false return false
} }

// GetCloudBrainUnStoppedJob returns at most 100 Cloudbrain records whose
// "status" column is not one of the stopped, failed, deleted or completed
// states listed in the NotIn clause below.
//
// On query failure it returns a nil slice together with the error.
func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) {
	cloudbrains := make([]*Cloudbrain, 0, 10)

	// Run the query as its own statement. Writing
	// `return cloudbrains, x....Find(&cloudbrains)` would read the slice
	// variable and call a function that reassigns it within one expression
	// list, and the Go spec leaves the relative order of those two unspecified.
	err := x.
		NotIn("status",
			JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted,
			ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed,
			ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed).
		// NOTE(review): no explicit ordering is applied, so which 100 rows are
		// returned is up to the database — confirm this is acceptable.
		Limit(100).
		Find(&cloudbrains)
	if err != nil {
		return nil, err
	}
	return cloudbrains, nil
}

+ 13
- 0
modules/cron/tasks_basic.go View File

@@ -185,6 +185,17 @@ func registerHandleSummaryStatistic() {
}) })
} }


func registerSyncCloudbrainStatus() {
RegisterTaskFatal("sync_cloudbrain_status", &BaseConfig{
Enabled: true,
RunAtStart: false,
Schedule: "@every 10m",
}, func(ctx context.Context, _ *models.User, _ Config) error {
repo.SyncCloudbrainStatus()
return nil
})
}

func initBasicTasks() { func initBasicTasks() {
registerUpdateMirrorTask() registerUpdateMirrorTask()
registerRepoHealthCheck() registerRepoHealthCheck()
@@ -202,4 +213,6 @@ func initBasicTasks() {


registerHandleRepoAndUserStatistic() registerHandleRepoAndUserStatistic()
registerHandleSummaryStatistic() registerHandleSummaryStatistic()

registerSyncCloudbrainStatus()
} }

+ 75
- 0
routers/repo/cloudbrain.go View File

@@ -715,3 +715,78 @@ func downloadRateCode(repo *models.Repository, taskName, gitPath, codePath, benc


return nil return nil
} }

// SyncCloudbrainStatus refreshes the stored status of every task returned by
// models.GetCloudBrainUnStoppedJob by querying the backend that owns it:
//
//   - TypeCloudBrainOne: cloudbrain.GetJob; the state of the first task status
//     of the cloudbrain.SubTaskName role becomes the new status, and the record
//     is persisted unless that state is JobWaiting.
//   - TypeCloudBrainTwo, debug job: modelarts.GetJob; the reported status is
//     persisted.
//   - TypeCloudBrainTwo, train job: modelarts.GetTrainJob; the translated
//     status and both duration fields are persisted.
//
// A failure on one task is logged and the loop moves on to the next task, so
// a single bad record cannot abort the whole run.
func SyncCloudbrainStatus() {
	cloudBrains, err := models.GetCloudBrainUnStoppedJob()
	if err != nil {
		log.Error("GetCloudBrainUnStoppedJob failed:%v", err)
		return
	}

	for _, task := range cloudBrains {
		switch task.Type {
		case models.TypeCloudBrainOne:
			result, err := cloudbrain.GetJob(task.JobID)
			if err != nil {
				log.Error("GetJob(%s) failed:%v", task.JobName, err)
				continue
			}
			if result == nil {
				continue
			}

			jobRes, err := models.ConvertToJobResultPayload(result.Payload)
			if err != nil {
				log.Error("ConvertToJobResultPayload(%s) failed:%v", task.JobName, err)
				continue
			}

			// The role entry may be missing or of an unexpected shape; use the
			// comma-ok form so a malformed payload cannot panic the cron run.
			subTask, ok := jobRes.TaskRoles[cloudbrain.SubTaskName].(map[string]interface{})
			if !ok {
				log.Error("GetJob(%s) failed:task role %s is missing or malformed", task.JobName, cloudbrain.SubTaskName)
				continue
			}

			taskRes, err := models.ConvertToTaskPod(subTask)
			if err != nil {
				log.Error("ConvertToTaskPod(%s) failed:%v", task.JobName, err)
				continue
			}
			if len(taskRes.TaskStatuses) == 0 {
				log.Error("GetJob(%s) failed:no task statuses returned", task.JobName)
				continue
			}

			task.Status = taskRes.TaskStatuses[0].State
			// A task that is still waiting is not written back.
			if task.Status == string(models.JobWaiting) {
				continue
			}
			if err := models.UpdateJob(task); err != nil {
				log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
			}

		case models.TypeCloudBrainTwo:
			switch task.JobType {
			case string(models.JobTypeDebug):
				result, err := modelarts.GetJob(task.JobID)
				if err != nil {
					log.Error("GetJob(%s) failed:%v", task.JobName, err)
					continue
				}
				if result == nil {
					continue
				}

				task.Status = result.Status
				if err := models.UpdateJob(task); err != nil {
					log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
				}

			case string(models.JobTypeTrain):
				result, err := modelarts.GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
				if err != nil {
					log.Error("GetTrainJob(%s) failed:%v", task.JobName, err)
					continue
				}
				if result == nil {
					continue
				}

				task.Status = modelarts.TransTrainJobStatus(result.IntStatus)
				task.Duration = result.Duration
				task.TrainJobDuration = result.TrainJobDuration
				if err := models.UpdateJob(task); err != nil {
					log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
				}

			default:
				log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType)
			}

		default:
			log.Error("task.Type(%s) is error:%d", task.JobName, task.Type)
		}
	}
}

Loading…
Cancel
Save