From e2f8b5507ac021f0e032c2e83f1d30417e63d284 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 17 Dec 2021 17:57:02 +0800 Subject: [PATCH] debug --- models/cloudbrain.go | 3 + modules/cloudbrain/cloudbrain.go | 134 ++++++++++++++++++++++++++--- modules/storage/minio.go | 5 ++ routers/repo/cloudbrain.go | 22 ++--- routers/repo/modelarts.go | 11 +-- templates/repo/debugjob/index.tmpl | 0 6 files changed, 145 insertions(+), 30 deletions(-) mode change 100644 => 100755 templates/repo/debugjob/index.tmpl diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 509f4a9ed..c8bd81873 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -88,6 +88,9 @@ type Cloudbrain struct { UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` Duration int64 TrainJobDuration string + Image string //GPU镜像名称 + GpuQueue string //GPU类型即GPU队列 + ResourceSpecId int //GPU规格id DeletedAt time.Time `xorm:"deleted"` CanDebug bool `xorm:"-"` CanDel bool `xorm:"-"` diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 0f1c700d2..394afbcfa 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -1,6 +1,7 @@ package cloudbrain import ( + "code.gitea.io/gitea/modules/storage" "errors" "strconv" @@ -185,25 +186,28 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath, }, }) if err != nil { - log.Error("CreateJob failed:", err.Error()) + log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) return err } if jobResult.Code != Success { - log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg) + log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"]) return errors.New(jobResult.Msg) } var jobID = jobResult.Payload["jobId"].(string) err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: string(models.JobWaiting), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobID, - JobName: jobName, - SubTaskName: SubTaskName, - JobType: jobType, - Type: models.TypeCloudBrainOne, - Uuid: uuid, + Status: string(models.JobWaiting), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobID, + JobName: jobName, + SubTaskName: SubTaskName, + JobType: jobType, + Type: models.TypeCloudBrainOne, + Uuid: uuid, + Image: image, + GpuQueue: gpuQueue, + ResourceSpecId: resourceSpecId, }) if err != nil { @@ -212,3 +216,111 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath, return nil } + +func RestartTask(ctx *context.Context, task *models.Cloudbrain) error { + dataActualPath := setting.Attachment.Minio.RealPath + + setting.Attachment.Minio.Bucket + "/" + + setting.Attachment.Minio.BasePath + + models.AttachmentRelativePath(task.Uuid) + + task.Uuid + jobName := task.JobName + + var resourceSpec *models.ResourceSpec + for _, spec := range ResourceSpecs.ResourceSpec { + if task.ResourceSpecId == spec.Id { + resourceSpec = spec + } + } + + if resourceSpec == nil { + log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) + return errors.New("no such resourceSpec") + } + + jobResult, err := CreateJob(jobName, models.CreateJobParams{ + JobName: jobName, + RetryCount: 1, + GpuType: task.GpuQueue, + Image: task.Image, + TaskRoles: []models.TaskRole{ + { + Name: SubTaskName, + TaskNumber: 1, + MinSucceededTaskCount: 1, + MinFailedTaskCount: 1, + CPUNumber: resourceSpec.CpuNum, + GPUNumber: resourceSpec.GpuNum, + MemoryMB: resourceSpec.MemMiB, + ShmMB: resourceSpec.ShareMemMiB, + Command: Command, + NeedIBDevice: false, + IsMainRole: false, + UseNNI: false, + }, + }, + Volumes: []models.Volume{ + { + HostPath: models.StHostPath{ + Path: setting.JobPath + jobName + CodeMountPath, + MountPath: CodeMountPath, + ReadOnly: false, + }, + }, + { + HostPath: models.StHostPath{ + Path: dataActualPath, + MountPath: DataSetMountPath, + ReadOnly: true, + }, + }, + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, ModelMountPath + "/"), + MountPath: ModelMountPath, + ReadOnly: false, + }, + }, + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, BenchMarkMountPath + "/"), + MountPath: BenchMarkMountPath, + ReadOnly: true, + }, + }, + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath + "/"), + MountPath: Snn4imagenetMountPath, + ReadOnly: true, + }, + }, + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, BrainScoreMountPath + "/"), + MountPath: BrainScoreMountPath, + ReadOnly: true, + }, + }, + }, + }) + if err != nil { + log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) + return err + } + if jobResult.Code != Success { + log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"]) + return errors.New(jobResult.Msg) + } + + var jobID = jobResult.Payload["jobId"].(string) + task.JobID = jobID + task.Status = string(models.JobWaiting) + err = models.UpdateJob(task) + + if err != nil { + log.Error("UpdateJob(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"]) + return err + } + + return nil +} diff --git a/modules/storage/minio.go b/modules/storage/minio.go index 664e58d1b..8e85d0eae 100755 --- a/modules/storage/minio.go +++ b/modules/storage/minio.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "code.gitea.io/gitea/modules/setting" "github.com/minio/minio-go" ) @@ -128,3 +129,7 @@ func (m *MinioStorage) UploadObject(fileName, filePath string) error { _, err := m.client.FPutObject(m.bucket, fileName, filePath, minio.PutObjectOptions{}) return err } + +func GetMinioPath(jobName, suffixPath string) string { + return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath +} diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 0374fda25..97415f39e 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -236,9 +236,9 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { uploadCodeToMinio(brainScorePath+"/", jobName, cloudbrain.BrainScoreMountPath+"/") } - err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"), - getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), - getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId) + err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), + storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), + storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId) if err != nil { cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form) @@ -256,6 +256,8 @@ func CloudBrainRestart(ctx *context.Context) { 3、更新此任务的状态 */ + //todo: 是否启用事务? + var jobID = ctx.Params(":jobid") task, err := models.GetCloudbrainByJobID(jobID) if err != nil { @@ -283,13 +285,9 @@ func CloudBrainRestart(ctx *context.Context) { } } - /*jobName := task.JobName - codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - err = cloudbrain.GenerateTask(ctx, jobName, image, cloudbrain.Command, task.Uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"), - getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), - getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId) - */ + err = cloudbrain.RestartTask(ctx, task) if err != nil { + log.Error("RestartTask failed:%v", err.Error(), ctx.Data["MsgID"]) ctx.RenderWithErr(err.Error(), tplCloudBrainIndex, nil) return } @@ -609,7 +607,7 @@ func getImages(ctx *context.Context, imageType string) { func GetModelDirs(jobName string, parentDir string) (string, error) { var req string - modelActualPath := getMinioPath(jobName, cloudbrain.ModelMountPath+"/") + modelActualPath := storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/") if parentDir == "" { req = "baseDir=" + modelActualPath } else { @@ -619,10 +617,6 @@ func GetModelDirs(jobName string, parentDir string) (string, error) { return getDirs(req) } -func getMinioPath(jobName, suffixPath string) string { - return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath -} - func CloudBrainDownloadModel(ctx *context.Context) { parentDir := ctx.Query("parentDir") fileName := ctx.Query("fileName") diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index a1e47e11f..67be2be19 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -245,13 +245,13 @@ func NotebookManage(ctx *context.Context) { if action == models.ActionStop { if task.Status != string(models.ModelArtsRunning) { log.Error("the job(%s) is not running", task.JobName) - ctx.ServerError("the job is not running", errors.New("the job is not running")) + ctx.RenderWithErr("the job is not running", tplDebugJobIndex, nil) return } } else if action == models.ActionRestart { if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) { log.Error("the job(%s) is not stopped", task.JobName) - ctx.ServerError("the job is not running", errors.New("the job is not running")) + ctx.RenderWithErr("the job is not stopped", tplDebugJobIndex, nil) return } @@ -269,7 +269,7 @@ func NotebookManage(ctx *context.Context) { } } else { log.Error("the action(%s) is illegal", action) - ctx.ServerError("the action is illegal", errors.New("the action is illegal")) + ctx.RenderWithErr("非法操作", tplDebugJobIndex, nil) return } @@ -279,14 +279,15 @@ func NotebookManage(ctx *context.Context) { res, err := modelarts.ManageNotebook(jobID, param) if err != nil { log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error()) - ctx.ServerError("ManageNotebook failed", err) + ctx.RenderWithErr("启动失败", tplDebugJobIndex, nil) return } task.Status = res.CurrentStatus err = models.UpdateJob(task) if err != nil { - ctx.ServerError("UpdateJob failed", err) + log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error()) + ctx.RenderWithErr("system error", tplDebugJobIndex, nil) return } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob") diff --git a/templates/repo/debugjob/index.tmpl b/templates/repo/debugjob/index.tmpl old mode 100644 new mode 100755