@@ -88,6 +88,9 @@ type Cloudbrain struct {
    UpdatedUnix      timeutil.TimeStamp `xorm:"INDEX updated"`
    Duration         int64
    TrainJobDuration string
    Image            string // GPU image name
    GpuQueue         string // GPU type, i.e. the GPU queue
    ResourceSpecId   int    // GPU resource spec ID
    DeletedAt        time.Time `xorm:"deleted"`
    CanDebug         bool      `xorm:"-"`
    CanDel           bool      `xorm:"-"`
@@ -1,6 +1,7 @@
package cloudbrain

import (
    "code.gitea.io/gitea/modules/storage"
    "errors"
    "strconv"
@@ -185,25 +186,28 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath,
        },
    })
    if err != nil {
        log.Error("CreateJob failed:", err.Error())
        log.Error("CreateJob failed: %v, msgID: %v", err.Error(), ctx.Data["MsgID"])
        return err
    }
    if jobResult.Code != Success {
        log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg)
        log.Error("CreateJob(%s) failed: %s, msgID: %v", jobName, jobResult.Msg, ctx.Data["MsgID"])
        return errors.New(jobResult.Msg)
    }
    var jobID = jobResult.Payload["jobId"].(string)
    err = models.CreateCloudbrain(&models.Cloudbrain{
        Status:      string(models.JobWaiting),
        UserID:      ctx.User.ID,
        RepoID:      ctx.Repo.Repository.ID,
        JobID:       jobID,
        JobName:     jobName,
        SubTaskName: SubTaskName,
        JobType:     jobType,
        Type:        models.TypeCloudBrainOne,
        Uuid:        uuid,
        Status:         string(models.JobWaiting),
        UserID:         ctx.User.ID,
        RepoID:         ctx.Repo.Repository.ID,
        JobID:          jobID,
        JobName:        jobName,
        SubTaskName:    SubTaskName,
        JobType:        jobType,
        Type:           models.TypeCloudBrainOne,
        Uuid:           uuid,
        Image:          image,
        GpuQueue:       gpuQueue,
        ResourceSpecId: resourceSpecId,
    })
    if err != nil {
@@ -212,3 +216,111 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath,
    return nil
}
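// RestartTask re-submits a stopped cloudbrain job to the backend, reusing the image,
// GPU queue and resource spec that were persisted on the task when it was first created,
// then stores the new job ID and resets the task status to JobWaiting.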
func RestartTask(ctx *context.Context, task *models.Cloudbrain) error {
    dataActualPath := setting.Attachment.Minio.RealPath +
        setting.Attachment.Minio.Bucket + "/" +
        setting.Attachment.Minio.BasePath +
        models.AttachmentRelativePath(task.Uuid) +
        task.Uuid
    jobName := task.JobName

    var resourceSpec *models.ResourceSpec
    for _, spec := range ResourceSpecs.ResourceSpec {
        if task.ResourceSpecId == spec.Id {
            resourceSpec = spec
            break
        }
    }
    if resourceSpec == nil {
        log.Error("no such resourceSpecId(%d), msgID: %v", task.ResourceSpecId, ctx.Data["MsgID"])
        return errors.New("no such resourceSpec")
    }
    jobResult, err := CreateJob(jobName, models.CreateJobParams{
        JobName:    jobName,
        RetryCount: 1,
        GpuType:    task.GpuQueue,
        Image:      task.Image,
        TaskRoles: []models.TaskRole{
            {
                Name:                  SubTaskName,
                TaskNumber:            1,
                MinSucceededTaskCount: 1,
                MinFailedTaskCount:    1,
                CPUNumber:             resourceSpec.CpuNum,
                GPUNumber:             resourceSpec.GpuNum,
                MemoryMB:              resourceSpec.MemMiB,
                ShmMB:                 resourceSpec.ShareMemMiB,
                Command:               Command,
                NeedIBDevice:          false,
                IsMainRole:            false,
                UseNNI:                false,
            },
        },
        Volumes: []models.Volume{
            {
                HostPath: models.StHostPath{
                    Path:      setting.JobPath + jobName + CodeMountPath,
                    MountPath: CodeMountPath,
                    ReadOnly:  false,
                },
            },
            {
                HostPath: models.StHostPath{
                    Path:      dataActualPath,
                    MountPath: DataSetMountPath,
                    ReadOnly:  true,
                },
            },
            {
                HostPath: models.StHostPath{
                    Path:      storage.GetMinioPath(jobName, ModelMountPath+"/"),
                    MountPath: ModelMountPath,
                    ReadOnly:  false,
                },
            },
            {
                HostPath: models.StHostPath{
                    Path:      storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
                    MountPath: BenchMarkMountPath,
                    ReadOnly:  true,
                },
            },
            {
                HostPath: models.StHostPath{
                    Path:      storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
                    MountPath: Snn4imagenetMountPath,
                    ReadOnly:  true,
                },
            },
            {
                HostPath: models.StHostPath{
                    Path:      storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
                    MountPath: BrainScoreMountPath,
                    ReadOnly:  true,
                },
            },
        },
    })
    if err != nil {
        log.Error("CreateJob failed: %v, msgID: %v", err.Error(), ctx.Data["MsgID"])
        return err
    }
    if jobResult.Code != Success {
        log.Error("CreateJob(%s) failed: %s, msgID: %v", jobName, jobResult.Msg, ctx.Data["MsgID"])
        return errors.New(jobResult.Msg)
    }
    var jobID = jobResult.Payload["jobId"].(string)
    task.JobID = jobID
    task.Status = string(models.JobWaiting)
    err = models.UpdateJob(task)
    if err != nil {
        log.Error("UpdateJob(%s) failed: %v, msgID: %v", jobName, err.Error(), ctx.Data["MsgID"])
        return err
    }
    return nil
}
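For context, a minimal sketch (not part of this change) of how a handler is expected to drive RestartTask; it mirrors the CloudBrainRestart hunk further below, and the surrounding error handling is an assumption for illustration:

    task, err := models.GetCloudbrainByJobID(ctx.Params(":jobid"))
    if err != nil {
        ctx.ServerError("GetCloudbrainByJobID failed", err)
        return
    }
    // RestartTask re-submits the job and updates task.JobID / task.Status itself.
    if err := cloudbrain.RestartTask(ctx, task); err != nil {
        ctx.RenderWithErr(err.Error(), tplCloudBrainIndex, nil)
        return
    }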
@@ -11,6 +11,7 @@ import (
    "strings"
    "time"
    "code.gitea.io/gitea/modules/setting"
    "github.com/minio/minio-go"
)
@@ -128,3 +129,7 @@ func (m *MinioStorage) UploadObject(fileName, filePath string) error {
    _, err := m.client.FPutObject(m.bucket, fileName, filePath, minio.PutObjectOptions{})
    return err
}
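// GetMinioPath builds the host path of a job directory inside the minio data root:
// <RealPath><Bucket>/<CBCodePathPrefix><jobName><suffixPath>.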
func GetMinioPath(jobName, suffixPath string) string {
    return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath
}
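To make the concatenation concrete, here is a minimal runnable sketch of the path GetMinioPath assembles; every value below is an assumed placeholder (the real ones come from app.ini and the cloudbrain mount-path constants):

    package main

    import "fmt"

    func main() {
        // Assumed example values -- not the real configuration.
        realPath := "/data/minio/" // setting.Attachment.Minio.RealPath
        bucket := "gitea"          // setting.Attachment.Minio.Bucket
        prefix := "job/"           // setting.CBCodePathPrefix
        jobName := "cloudbrain-abc123"
        suffix := "/model/" // e.g. cloudbrain.ModelMountPath + "/"

        // Same concatenation as storage.GetMinioPath(jobName, suffix).
        fmt.Println(realPath + bucket + "/" + prefix + jobName + suffix)
        // Output: /data/minio/gitea/job/cloudbrain-abc123/model/
    }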
@@ -236,9 +236,9 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
        uploadCodeToMinio(brainScorePath+"/", jobName, cloudbrain.BrainScoreMountPath+"/")
    }
    err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
        getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
        getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId)
    err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
        storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
        storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId)
    if err != nil {
        cloudBrainNewDataPrepare(ctx)
        ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form)
@@ -256,6 +256,8 @@ func CloudBrainRestart(ctx *context.Context) {
    3. update the status of this task
    */
    //todo: should this be wrapped in a transaction?
    var jobID = ctx.Params(":jobid")
    task, err := models.GetCloudbrainByJobID(jobID)
    if err != nil {
@@ -283,13 +285,9 @@ func CloudBrainRestart(ctx *context.Context) {
        }
    }
    /*jobName := task.JobName
    codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
    err = cloudbrain.GenerateTask(ctx, jobName, image, cloudbrain.Command, task.Uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
        getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
        getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId)
    */
    err = cloudbrain.RestartTask(ctx, task)
    if err != nil {
        log.Error("RestartTask failed: %v, msgID: %v", err.Error(), ctx.Data["MsgID"])
        ctx.RenderWithErr(err.Error(), tplCloudBrainIndex, nil)
        return
    }
@@ -609,7 +607,7 @@ func getImages(ctx *context.Context, imageType string) {
func GetModelDirs(jobName string, parentDir string) (string, error) {
    var req string
    modelActualPath := getMinioPath(jobName, cloudbrain.ModelMountPath+"/")
    modelActualPath := storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")
    if parentDir == "" {
        req = "baseDir=" + modelActualPath
    } else {
@@ -619,10 +617,6 @@ func GetModelDirs(jobName string, parentDir string) (string, error) {
    return getDirs(req)
}
func getMinioPath(jobName, suffixPath string) string {
    return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath
}
func CloudBrainDownloadModel(ctx *context.Context) {
    parentDir := ctx.Query("parentDir")
    fileName := ctx.Query("fileName")
@@ -245,13 +245,13 @@ func NotebookManage(ctx *context.Context) {
    if action == models.ActionStop {
        if task.Status != string(models.ModelArtsRunning) {
            log.Error("the job(%s) is not running", task.JobName)
            ctx.ServerError("the job is not running", errors.New("the job is not running"))
            ctx.RenderWithErr("the job is not running", tplDebugJobIndex, nil)
            return
        }
    } else if action == models.ActionRestart {
        if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
            log.Error("the job(%s) is not stopped", task.JobName)
            ctx.ServerError("the job is not running", errors.New("the job is not running"))
            ctx.RenderWithErr("the job is not stopped", tplDebugJobIndex, nil)
            return
        }
@@ -269,7 +269,7 @@ func NotebookManage(ctx *context.Context) {
        }
    } else {
        log.Error("the action(%s) is illegal", action)
        ctx.ServerError("the action is illegal", errors.New("the action is illegal"))
        ctx.RenderWithErr("the action is illegal", tplDebugJobIndex, nil)
        return
    }
@@ -279,14 +279,15 @@ func NotebookManage(ctx *context.Context) {
    res, err := modelarts.ManageNotebook(jobID, param)
    if err != nil {
        log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error())
        ctx.ServerError("ManageNotebook failed", err)
        ctx.RenderWithErr("failed to start", tplDebugJobIndex, nil)
        return
    }
    task.Status = res.CurrentStatus
    err = models.UpdateJob(task)
    if err != nil {
        ctx.ServerError("UpdateJob failed", err)
        log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error())
        ctx.RenderWithErr("system error", tplDebugJobIndex, nil)
        return
    }
    ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob")