| @@ -88,6 +88,9 @@ type Cloudbrain struct { | |||
| UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` | |||
| Duration int64 | |||
| TrainJobDuration string | |||
| Image string //GPU镜像名称 | |||
| GpuQueue string //GPU类型即GPU队列 | |||
| ResourceSpecId int //GPU规格id | |||
| DeletedAt time.Time `xorm:"deleted"` | |||
| CanDebug bool `xorm:"-"` | |||
| CanDel bool `xorm:"-"` | |||
| @@ -102,7 +105,7 @@ type Cloudbrain struct { | |||
| IsLatestVersion string //是否是最新版本,1是,0否 | |||
| CommitID string //提交的仓库代码id | |||
| PreVersionName string //父版本名称 | |||
| ComputeResource string //计算资源,例如npu | |||
| ComputeResource string `xorm:"-"` //计算资源,例如npu | |||
| EngineID int64 //引擎id | |||
| TrainUrl string //输出的obs路径 | |||
| @@ -1,6 +1,8 @@ | |||
| package cloudbrain | |||
| import ( | |||
| "code.gitea.io/gitea/modules/storage" | |||
| "encoding/json" | |||
| "errors" | |||
| "strconv" | |||
| @@ -107,6 +109,9 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath, | |||
| uuid | |||
| var resourceSpec *models.ResourceSpec | |||
| if ResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | |||
| } | |||
| for _, spec := range ResourceSpecs.ResourceSpec { | |||
| if resourceSpecId == spec.Id { | |||
| resourceSpec = spec | |||
| @@ -185,28 +190,142 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateJob failed:", err.Error()) | |||
| log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) | |||
| return err | |||
| } | |||
| if jobResult.Code != Success { | |||
| log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg) | |||
| log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"]) | |||
| return errors.New(jobResult.Msg) | |||
| } | |||
| var jobID = jobResult.Payload["jobId"].(string) | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: string(models.JobWaiting), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobID, | |||
| JobName: jobName, | |||
| SubTaskName: SubTaskName, | |||
| JobType: jobType, | |||
| Type: models.TypeCloudBrainOne, | |||
| Uuid: uuid, | |||
| Status: string(models.JobWaiting), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobID, | |||
| JobName: jobName, | |||
| SubTaskName: SubTaskName, | |||
| JobType: jobType, | |||
| Type: models.TypeCloudBrainOne, | |||
| Uuid: uuid, | |||
| Image: image, | |||
| GpuQueue: gpuQueue, | |||
| ResourceSpecId: resourceSpecId, | |||
| }) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| return nil | |||
| } | |||
| func RestartTask(ctx *context.Context, task *models.Cloudbrain) error { | |||
| dataActualPath := setting.Attachment.Minio.RealPath + | |||
| setting.Attachment.Minio.Bucket + "/" + | |||
| setting.Attachment.Minio.BasePath + | |||
| models.AttachmentRelativePath(task.Uuid) + | |||
| task.Uuid | |||
| jobName := task.JobName | |||
| var resourceSpec *models.ResourceSpec | |||
| if ResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | |||
| } | |||
| for _, spec := range ResourceSpecs.ResourceSpec { | |||
| if task.ResourceSpecId == spec.Id { | |||
| resourceSpec = spec | |||
| } | |||
| } | |||
| if resourceSpec == nil { | |||
| log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | |||
| return errors.New("no such resourceSpec") | |||
| } | |||
| jobResult, err := CreateJob(jobName, models.CreateJobParams{ | |||
| JobName: jobName, | |||
| RetryCount: 1, | |||
| GpuType: task.GpuQueue, | |||
| Image: task.Image, | |||
| TaskRoles: []models.TaskRole{ | |||
| { | |||
| Name: SubTaskName, | |||
| TaskNumber: 1, | |||
| MinSucceededTaskCount: 1, | |||
| MinFailedTaskCount: 1, | |||
| CPUNumber: resourceSpec.CpuNum, | |||
| GPUNumber: resourceSpec.GpuNum, | |||
| MemoryMB: resourceSpec.MemMiB, | |||
| ShmMB: resourceSpec.ShareMemMiB, | |||
| Command: Command, | |||
| NeedIBDevice: false, | |||
| IsMainRole: false, | |||
| UseNNI: false, | |||
| }, | |||
| }, | |||
| Volumes: []models.Volume{ | |||
| { | |||
| HostPath: models.StHostPath{ | |||
| Path: storage.GetMinioPath(jobName, CodeMountPath + "/"), | |||
| MountPath: CodeMountPath, | |||
| ReadOnly: false, | |||
| }, | |||
| }, | |||
| { | |||
| HostPath: models.StHostPath{ | |||
| Path: dataActualPath, | |||
| MountPath: DataSetMountPath, | |||
| ReadOnly: true, | |||
| }, | |||
| }, | |||
| { | |||
| HostPath: models.StHostPath{ | |||
| Path: storage.GetMinioPath(jobName, ModelMountPath + "/"), | |||
| MountPath: ModelMountPath, | |||
| ReadOnly: false, | |||
| }, | |||
| }, | |||
| { | |||
| HostPath: models.StHostPath{ | |||
| Path: storage.GetMinioPath(jobName, BenchMarkMountPath + "/"), | |||
| MountPath: BenchMarkMountPath, | |||
| ReadOnly: true, | |||
| }, | |||
| }, | |||
| { | |||
| HostPath: models.StHostPath{ | |||
| Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath + "/"), | |||
| MountPath: Snn4imagenetMountPath, | |||
| ReadOnly: true, | |||
| }, | |||
| }, | |||
| { | |||
| HostPath: models.StHostPath{ | |||
| Path: storage.GetMinioPath(jobName, BrainScoreMountPath + "/"), | |||
| MountPath: BrainScoreMountPath, | |||
| ReadOnly: true, | |||
| }, | |||
| }, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) | |||
| return err | |||
| } | |||
| if jobResult.Code != Success { | |||
| log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"]) | |||
| return errors.New(jobResult.Msg) | |||
| } | |||
| var jobID = jobResult.Payload["jobId"].(string) | |||
| task.JobID = jobID | |||
| task.Status = string(models.JobWaiting) | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"]) | |||
| return err | |||
| } | |||
| @@ -277,7 +277,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error | |||
| DatasetName: attach.Name, | |||
| CommitID: req.CommitID, | |||
| IsLatestVersion: req.IsLatestVersion, | |||
| ComputeResource: NPUResource, | |||
| //ComputeResource: NPUResource, | |||
| EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| BranchName: req.BranchName, | |||
| @@ -174,7 +174,7 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func StopJob(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { | |||
| func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.NotebookActionResult | |||
| @@ -207,8 +207,8 @@ sendjob: | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("StopJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| return &result, fmt.Errorf("StopJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| @@ -11,6 +11,7 @@ import ( | |||
| "strings" | |||
| "time" | |||
| "code.gitea.io/gitea/modules/setting" | |||
| "github.com/minio/minio-go" | |||
| ) | |||
| @@ -128,3 +129,7 @@ func (m *MinioStorage) UploadObject(fileName, filePath string) error { | |||
| _, err := m.client.FPutObject(m.bucket, fileName, filePath, minio.PutObjectOptions{}) | |||
| return err | |||
| } | |||
| func GetMinioPath(jobName, suffixPath string) string { | |||
| return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath | |||
| } | |||
| @@ -787,6 +787,7 @@ model_noright=无权限操作 | |||
| model_rename=模型名称重复,请修改模型名称 | |||
| debug=调试 | |||
| debug_again=再次调试 | |||
| stop=停止 | |||
| delete=删除 | |||
| model_download=模型下载 | |||
| @@ -206,7 +206,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||
| } | |||
| repo := ctx.Repo.Repository | |||
| downloadCode(repo, codePath) | |||
| uploadCodeToMinio(codePath+"/", jobName, "/code/") | |||
| uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/") | |||
| modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" | |||
| mkModelPath(modelPath) | |||
| @@ -236,9 +236,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||
| uploadCodeToMinio(brainScorePath+"/", jobName, cloudbrain.BrainScoreMountPath+"/") | |||
| } | |||
| err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"), | |||
| getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), | |||
| getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId) | |||
| err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"), | |||
| storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), | |||
| storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), | |||
| storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId) | |||
| if err != nil { | |||
| cloudBrainNewDataPrepare(ctx) | |||
| ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form) | |||
| @@ -247,6 +248,72 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob") | |||
| } | |||
| func CloudBrainRestart(ctx *context.Context) { | |||
| var jobID = ctx.Params(":jobid") | |||
| var resultCode = "0" | |||
| var errorMsg = "" | |||
| var status = "" | |||
| for { | |||
| task, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error(), ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| if task.Status != string(models.JobStopped) && task.Status != string(models.JobSucceeded) && task.Status != string(models.JobFailed) { | |||
| log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "the job is not stopped" | |||
| break | |||
| } | |||
| if task.Image == "" || task.GpuQueue == "" || task.Type != models.TypeCloudBrainOne { | |||
| log.Error("the job(%s) version is too old", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "the job's version is too old and can not be restarted" | |||
| break | |||
| } | |||
| count, err := models.GetCloudbrainCountByUserID(ctx.User.ID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } else { | |||
| if count >= 1 { | |||
| log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "the user already has running or waiting task" | |||
| break | |||
| } | |||
| } | |||
| err = cloudbrain.RestartTask(ctx, task) | |||
| if err != nil { | |||
| log.Error("RestartTask failed:%v", err.Error(), ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| status = task.Status | |||
| jobID = task.JobID | |||
| break | |||
| } | |||
| ctx.JSON(200, map[string]string{ | |||
| "result_code": resultCode, | |||
| "error_msg": errorMsg, | |||
| "status": status, | |||
| "job_id": jobID, | |||
| }) | |||
| } | |||
| func CloudBrainShow(ctx *context.Context) { | |||
| ctx.Data["PageIsCloudBrain"] = true | |||
| @@ -351,32 +418,53 @@ func CloudBrainCommitImage(ctx *context.Context, form auth.CommitImageCloudBrain | |||
| func CloudBrainStop(ctx *context.Context) { | |||
| var jobID = ctx.Params(":jobid") | |||
| task, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| ctx.ServerError("GetCloudbrainByJobID failed", err) | |||
| return | |||
| } | |||
| var resultCode = "0" | |||
| var errorMsg = "" | |||
| var status = "" | |||
| if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) { | |||
| log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) | |||
| ctx.ServerError("the job has been stopped", errors.New("the job has been stopped")) | |||
| return | |||
| } | |||
| for { | |||
| task, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| err = cloudbrain.StopJob(jobID) | |||
| if err != nil { | |||
| log.Error("StopJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["msgID"]) | |||
| ctx.ServerError("StopJob failed", err) | |||
| return | |||
| } | |||
| if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) { | |||
| log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| task.Status = string(models.JobStopped) | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| ctx.ServerError("UpdateJob failed", err) | |||
| return | |||
| err = cloudbrain.StopJob(jobID) | |||
| if err != nil { | |||
| log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| task.Status = string(models.JobStopped) | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| status = task.Status | |||
| break | |||
| } | |||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob") | |||
| ctx.JSON(200, map[string]string{ | |||
| "result_code": resultCode, | |||
| "error_msg": errorMsg, | |||
| "status": status, | |||
| "job_id": jobID, | |||
| }) | |||
| } | |||
| func StopJobsByUserID(userID int64) { | |||
| @@ -423,7 +511,7 @@ func StopJobs(cloudBrains []*models.Cloudbrain) { | |||
| Action: models.ActionStop, | |||
| } | |||
| err := retry(3, time.Second*30, func() error { | |||
| _, err := modelarts.StopJob(taskInfo.JobID, param) | |||
| _, err := modelarts.ManageNotebook(taskInfo.JobID, param) | |||
| return err | |||
| }) | |||
| logErrorAndUpdateJobStatus(err, taskInfo) | |||
| @@ -560,7 +648,7 @@ func getImages(ctx *context.Context, imageType string) { | |||
| func GetModelDirs(jobName string, parentDir string) (string, error) { | |||
| var req string | |||
| modelActualPath := getMinioPath(jobName, cloudbrain.ModelMountPath+"/") | |||
| modelActualPath := storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/") | |||
| if parentDir == "" { | |||
| req = "baseDir=" + modelActualPath | |||
| } else { | |||
| @@ -570,10 +658,6 @@ func GetModelDirs(jobName string, parentDir string) (string, error) { | |||
| return getDirs(req) | |||
| } | |||
| func getMinioPath(jobName, suffixPath string) string { | |||
| return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath | |||
| } | |||
| func CloudBrainDownloadModel(ctx *context.Context) { | |||
| parentDir := ctx.Query("parentDir") | |||
| fileName := ctx.Query("fileName") | |||
| @@ -42,6 +42,7 @@ const ( | |||
| func DebugJobIndex(ctx *context.Context) { | |||
| debugListType := ctx.Query("debugListType") | |||
| ctx.Data["ListType"] = debugListType | |||
| MustEnableCloudbrain(ctx) | |||
| repo := ctx.Repo.Repository | |||
| page := ctx.QueryInt("page") | |||
| @@ -73,21 +74,19 @@ func DebugJobIndex(ctx *context.Context) { | |||
| } | |||
| for i, task := range ciTasks { | |||
| ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx) | |||
| ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) | |||
| if task.Cloudbrain.Type == models.TypeCloudBrainOne { | |||
| ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx) | |||
| ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) | |||
| ciTasks[i].Cloudbrain.ComputeResource = modelarts.GPUResource | |||
| } | |||
| if task.Cloudbrain.Type == models.TypeCloudBrainTwo { | |||
| ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx) | |||
| ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) | |||
| } else if task.Cloudbrain.Type == models.TypeCloudBrainTwo { | |||
| ciTasks[i].Cloudbrain.ComputeResource = modelarts.NPUResource | |||
| } | |||
| } | |||
| pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) | |||
| pager.SetDefaultParams(ctx) | |||
| //pager.SetDefaultParams(ctx) | |||
| pager.AddParam(ctx, "debugListType", "ListType") | |||
| ctx.Data["Page"] = pager | |||
| ctx.Data["PageIsCloudBrain"] = true | |||
| ctx.Data["Tasks"] = ciTasks | |||
| @@ -232,38 +231,91 @@ func NotebookDebug(ctx *context.Context) { | |||
| ctx.Redirect(debugUrl) | |||
| } | |||
| func NotebookStop(ctx *context.Context) { | |||
| func NotebookManage(ctx *context.Context) { | |||
| var jobID = ctx.Params(":jobid") | |||
| log.Info(jobID) | |||
| task, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| ctx.ServerError("GetCloudbrainByJobID failed", err) | |||
| return | |||
| } | |||
| var action = ctx.Params(":action") | |||
| var resultCode = "0" | |||
| var errorMsg = "" | |||
| var status = "" | |||
| if task.Status != string(models.JobRunning) { | |||
| log.Error("the job(%s) is not running", task.JobName) | |||
| ctx.ServerError("the job is not running", errors.New("the job is not running")) | |||
| return | |||
| } | |||
| for { | |||
| task, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByJobID failed:%v", err, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| param := models.NotebookAction{ | |||
| Action: models.ActionStop, | |||
| } | |||
| res, err := modelarts.StopJob(jobID, param) | |||
| if err != nil { | |||
| log.Error("StopJob(%s) failed:%v", task.JobName, err.Error()) | |||
| ctx.ServerError("StopJob failed", err) | |||
| return | |||
| } | |||
| if action == models.ActionStop { | |||
| if task.Status != string(models.ModelArtsRunning) { | |||
| log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "the job is not running" | |||
| break | |||
| } | |||
| } else if action == models.ActionRestart { | |||
| if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) { | |||
| log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "the job is not stopped" | |||
| break | |||
| } | |||
| task.Status = res.CurrentStatus | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| ctx.ServerError("UpdateJob failed", err) | |||
| return | |||
| count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } else { | |||
| if count >= 1 { | |||
| log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "you have already a running or waiting task, can not create more" | |||
| break | |||
| } | |||
| } | |||
| action = models.ActionStart | |||
| } else { | |||
| log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "非法操作" | |||
| break | |||
| } | |||
| param := models.NotebookAction{ | |||
| Action: action, | |||
| } | |||
| res, err := modelarts.ManageNotebook(jobID, param) | |||
| if err != nil { | |||
| log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "启动失败" | |||
| break | |||
| } | |||
| task.Status = res.CurrentStatus | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| status = task.Status | |||
| break | |||
| } | |||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob") | |||
| ctx.JSON(200, map[string]string{ | |||
| "result_code": resultCode, | |||
| "error_msg": errorMsg, | |||
| "status": status, | |||
| "job_id": jobID, | |||
| }) | |||
| } | |||
| func NotebookDel(ctx *context.Context) { | |||
| @@ -323,6 +375,7 @@ func TrainJobIndex(ctx *context.Context) { | |||
| for i, task := range tasks { | |||
| tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) | |||
| tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) | |||
| tasks[i].ComputeResource = modelarts.NPUResource | |||
| } | |||
| pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) | |||
| @@ -968,6 +968,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Post("/commit_image", cloudbrain.AdminOrOwnerOrJobCreaterRight, bindIgnErr(auth.CommitImageCloudBrainForm{}), repo.CloudBrainCommitImage) | |||
| m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) | |||
| m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainDel) | |||
| m.Post("/restart", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainRestart) | |||
| m.Get("/rate", reqRepoCloudBrainReader, repo.GetRate) | |||
| m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) | |||
| m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainDownloadModel) | |||
| @@ -1003,7 +1004,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Group("/:jobid", func() { | |||
| m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) | |||
| m.Get("/debug", reqRepoCloudBrainWriter, repo.NotebookDebug) | |||
| m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookStop) | |||
| m.Post("/:action", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookManage) | |||
| m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookDel) | |||
| }) | |||
| m.Get("/create", reqRepoCloudBrainWriter, repo.NotebookNew) | |||
| @@ -202,14 +202,20 @@ | |||
| <div class="rect5"></div> | |||
| </div> | |||
| </div> | |||
| <!-- 提示框 --> | |||
| <div class="alert"></div> | |||
| <div class="repository release dataset-list view"> | |||
| {{template "repo/header" .}} | |||
| <!-- {{template "base/alert" .}} --> | |||
| <!-- 提示框 --> | |||
| <!-- 列表容器 --> | |||
| <div class="ui container"> | |||
| <div class="ui two column stackable grid "> | |||
| <div class="ui negative message" style="display: none;"> | |||
| <i class="close icon"></i> | |||
| <p></p> | |||
| </div> | |||
| <div class="ui two column stackable grid"> | |||
| <div class="column"> | |||
| <div class="ui blue small menu compact selectcloudbrain"> | |||
| <a class="active item" href="{{.RepoLink}}/debugjob?debugListType=all">{{$.i18n.Tr "repo.modelarts.notebook"}}</a> | |||
| @@ -282,7 +288,7 @@ | |||
| <div class="row"> | |||
| <!-- 任务名 --> | |||
| <div class="four wide column"> | |||
| <a class="title" href='{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}' title="{{.JobName}}" style="font-size: 14px;"> | |||
| <a class="title" href='{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}' title="{{.JobName}}" style="font-size: 14px;"> | |||
| <span class="fitted text_over" style="width: 90%;vertical-align: middle;">{{.JobName}}</span> | |||
| </a> | |||
| </div> | |||
| @@ -315,34 +321,44 @@ | |||
| </a> | |||
| {{end}} --> | |||
| <!-- 调试 --> | |||
| {{if .CanDebug}} | |||
| {{if eq .ComputeResource "CPU/GPU"}} | |||
| <a id="model-debug-{{.JobID}}" class='ui basic {{if ne .Status "RUNNING"}} disabled {{else}}blue {{end}}button' href="{{$.RepoLink}}/cloudbrain/{{.JobID}}/debug" target="_blank"> | |||
| {{$.i18n.Tr "repo.debug"}} | |||
| </a> | |||
| <form id="debugAgainForm-{{.JobID}}"> | |||
| {{$.CsrfTokenHtml}} | |||
| {{if .CanDebug}} | |||
| {{if eq .Status "RUNNING"}} | |||
| <a style="margin: 0 1rem;" id="model-debug-{{.JobID}}" class='ui basic blue button' onclick='debugAgain("{{.JobID}}","{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/")'> | |||
| {{$.i18n.Tr "repo.debug"}} | |||
| </a> | |||
| {{else}} | |||
| <a id="model-debug-{{.JobID}}" class='ui basic {{if eq .Status "CREATING" "STOPPING" "WAITING" "STARTING"}} disabled {{else}}blue {{end}}button' onclick='debugAgain("{{.JobID}}","{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/")'> | |||
| {{$.i18n.Tr "repo.debug_again"}} | |||
| </a> | |||
| {{end}} | |||
| {{else}} | |||
| <a id="model-debug-{{.JobID}}" class='ui basic {{if ne .Status "RUNNING"}} disabled {{else}}blue {{end}}button' href="{{$.RepoLink}}/modelarts/notebook/{{.JobID}}/debug" target="_blank"> | |||
| {{$.i18n.Tr "repo.debug"}} | |||
| <a class="ui basic disabled button"> | |||
| {{$.i18n.Tr "repo.debug_again"}} | |||
| </a> | |||
| {{end}} | |||
| {{else}} | |||
| <a class="ui basic disabled button"> | |||
| {{$.i18n.Tr "repo.debug"}} | |||
| </a> | |||
| {{end}} | |||
| </form> | |||
| <!-- 停止 --> | |||
| <form id="stopForm-{{.JobID}}" action="{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/stop" method="post" style="margin-left:-1px;"> | |||
| <form id="stopForm-{{.JobID}}" style="margin-left:-1px;"> | |||
| {{$.CsrfTokenHtml}} | |||
| {{if .CanDel}} | |||
| <a id="stop-model-debug-{{.JobID}}" class='ui basic {{if eq .Status "STOPPED" "FAILED" "START_FAILED" "STOPPING" "CREATING" "STARTING"}}disabled {{else}}blue {{end}}button' onclick="document.getElementById('stopForm-{{.JobID}}').submit();"> | |||
| {{$.i18n.Tr "repo.stop"}} | |||
| </a> | |||
| {{if eq .ComputeResource "CPU/GPU" }} | |||
| <a id="stop-model-debug-{{.JobID}}" class='ui basic {{if eq .Status "STOPPED" "FAILED" "START_FAILED" "STOPPING" "CREATING" "STARTING"}}disabled {{else}}blue {{end}}button' onclick='stopDebug("{{.JobID}}","{{$.RepoLink}}/cloudbrain/{{.JobID}}/stop")'> | |||
| {{$.i18n.Tr "repo.stop"}} | |||
| </a> | |||
| {{else}} | |||
| <a id="stop-model-debug-{{.JobID}}" class='ui basic {{if eq .Status "STOPPED" "FAILED" "START_FAILED" "STOPPING" "CREATING" "STARTING"}}disabled {{else}}blue {{end}}button' onclick='stopDebug("{{.JobID}}","{{$.RepoLink}}/modelarts/notebook/{{.JobID}}/stop")'> | |||
| {{$.i18n.Tr "repo.stop"}} | |||
| </a> | |||
| {{end}} | |||
| {{else}} | |||
| <a class="ui basic disabled button" onclick="document.getElementById('stopForm-{{.JobID}}').submit();"> | |||
| <a class="ui basic disabled button"> | |||
| {{$.i18n.Tr "repo.stop"}} | |||
| </a> | |||
| {{end}} | |||
| <input type="hidden" name="debugListType" value="all"> | |||
| </form> | |||
| <!-- 删除 --> | |||
| <form id="delForm-{{.JobID}}" action="{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/del" method="post"> | |||
| @@ -463,10 +479,21 @@ | |||
| <script> | |||
| // 调试和评分新开窗口 | |||
| const {AppSubUrl, StaticUrlPrefix, csrf} = window.config; | |||
| let url={{.RepoLink}} | |||
| let getParam=location.search.split('?debugListType=').pop() | |||
| let getParam=getQueryVariable('debugListType') | |||
| let dropdownValue = getParam==='all'||getParam==='' ? '全部' : getParam | |||
| localStorage.setItem('all',location.href) | |||
// Looks up a parameter in the query string of the current page.
//
// variable - parameter name to search for.
//
// Returns the raw (still URL-encoded) text between the first and second "="
// of the first "&"-separated entry whose name equals `variable`; this is
// undefined when that entry has no "=". Returns false when no entry matches.
function getQueryVariable(variable)
{
    const entries = window.location.search.substring(1).split("&");
    for (const entry of entries) {
        const parts = entry.split("=");
        if (parts[0] == variable) {
            return parts[1];
        }
    }
    return false;
}
| function stop(obj) { | |||
| if (obj.style.color != "rgb(204, 204, 204)") { | |||
| obj.target = '_blank' | |||
| @@ -499,7 +526,68 @@ | |||
| .modal('show') | |||
| } | |||
| } | |||
// Opens the debug page of a job, restarting the job first when it is not running.
//
// JobID    - job identifier; it is used to build the ids of the row's status
//            text ("<JobID>-text"), status icon ("<JobID>-icon"), buttons and
//            the form "debugAgainForm-<JobID>" whose fields are posted.
// debugUrl - base URL of the job, ending in "/"; "debug" or "restart" is
//            appended to it.
//
// When the status text is "RUNNING" the debug page is opened in a new window.
// Otherwise a restart request is posted: on result_code "0" the page is
// reloaded if the server returned a different job id, else the row is updated
// in place; on any other result_code the error banner is shown for 3 seconds.
function debugAgain(JobID,debugUrl){
    if($('#' + JobID+ '-text').text()==="RUNNING"){
        window.open(debugUrl+'debug')
        return
    }
    $.ajax({
        type:"POST",
        url:debugUrl+'restart',
        data:$('#debugAgainForm-'+JobID).serialize(),
        success:function(res){
            if(res.result_code==="0"){
                if(res.job_id!==JobID){
                    // The restart produced a new job id, so the ids in this row are stale.
                    location.reload()
                }else{
                    $('#' + JobID+'-icon').removeClass().addClass(res.status)
                    $('#' + JobID+ '-text').text(res.status)
                    $('#model-debug-'+JobID).removeClass('blue').addClass('disabled')
                    $('#model-delete-'+JobID).removeClass('blue').addClass('disabled')
                }
            }else{
                $(".ui.negative.message").css("display","block")
                $(".ui.negative.message p").text(res.error_msg)
                // Pass a function rather than a code string so nothing is evaluated at runtime.
                setTimeout(function(){
                    $('.message .close').click()
                },3000)
            }
        },
        error :function(res){
            console.log(res)
        }
    })
}
// Posts a stop request for a job and updates its row with the returned status.
//
// JobID   - job identifier; it is used to build the ids of the row's status
//           text, status icon, buttons and the form "stopForm-<JobID>" whose
//           fields are posted.
// stopUrl - URL the stop request is posted to.
//
// On result_code "0" the status icon and text are set to the returned status
// and the debug and stop buttons are disabled; for "STOPPED" the debug button
// is also relabelled and the image button disabled. On any other result_code
// the error banner is shown with the returned message for 3 seconds.
function stopDebug(JobID,stopUrl){
    $.ajax({
        type:"POST",
        url:stopUrl,
        data:$('#stopForm-'+JobID).serialize(),
        success:function(res){
            if(res.result_code==="0"){
                $('#' + JobID+'-icon').removeClass().addClass(res.status)
                $('#' + JobID+ '-text').text(res.status)
                if(res.status==="STOPPED"){
                    $('#model-debug-'+JobID).removeClass('blue').addClass('disabled').text("再次调试").css("margin","0")
                    $('#model-image-'+JobID).removeClass('blue').addClass('disabled')
                    $('#stop-model-debug-'+JobID).removeClass('blue').addClass('disabled')
                }
                else{
                    $('#model-debug-'+JobID).removeClass('blue').addClass('disabled')
                    $('#stop-model-debug-'+JobID).removeClass('blue').addClass('disabled')
                }
            }else{
                // The banner is selected by class, so the selector needs its
                // leading dot, and the hidden banner has to be made visible.
                $(".ui.negative.message").css("display","block")
                $(".ui.negative.message p").text(res.error_msg)
                setTimeout(function(){
                    $('.message .close').click()
                },3000)
            }
        },
        error :function(res){
            console.log(res)
        }
    })
}
| // 加载任务状态 | |||
| var timeid = window.setInterval(loadJobStatus, 15000); | |||
| $(document).ready(loadJobStatus); | |||
| @@ -508,8 +596,9 @@ | |||
| const jobID = job.dataset.jobid; | |||
| const repoPath = job.dataset.repopath; | |||
| const computeResource = job.dataset.resource | |||
| const initArray = ['STOPPED','FAILED','START_FAILED','CREATE_FAILED'] | |||
| const initArray = ['STOPPED','FAILED','START_FAILED','CREATE_FAILED','SUCCEEDED'] | |||
| if (initArray.includes(job.textContent.trim())) { | |||
| return | |||
| } | |||
| const diffResource = computeResource == "NPU" ? 'modelarts/notebook' : 'cloudbrain' | |||
| @@ -521,32 +610,30 @@ | |||
| $('#' + jobID+ '-text').text(status) | |||
| } | |||
| if(status==="RUNNING"){ | |||
| $('#model-debug-'+jobID).removeClass('disabled') | |||
| $('#model-debug-'+jobID).addClass('blue') | |||
| $('#model-image-'+jobID).removeClass('disabled') | |||
| $('#model-image-'+jobID).addClass('blue') | |||
| $('#model-debug-'+jobID).removeClass('disabled').addClass('blue').text('调试').css("margin","0 1rem") | |||
| $('#model-image-'+jobID).removeClass('disabled').addClass('blue') | |||
| } | |||
| if(status!=="RUNNING"){ | |||
| $('#model-debug-'+jobID).removeClass('blue') | |||
| $('#model-debug-'+jobID).addClass('disabled') | |||
| $('#model-image-'+jobID).removeClass('blue') | |||
| $('#model-image-'+jobID).addClass('disabled') | |||
| // $('#model-debug-'+jobID).removeClass('blue') | |||
| // $('#model-debug-'+jobID).addClass('disabled') | |||
| $('#model-image-'+jobID).removeClass('blue').addClass('disabled') | |||
| } | |||
| if(["CREATING","STOPPING","WAITING","STARTING"].includes(status)){ | |||
| $('#model-debug-'+jobID).removeClass('blue').addClass('disabled') | |||
| } | |||
| if(['STOPPED','FAILED','START_FAILED','CREATE_FAILED','SUCCEEDED'].includes(status)){ | |||
| $('#model-debug-'+jobID).removeClass('disabled').addClass('blue').text('再次调试').css("margin","0") | |||
| } | |||
| if(["RUNNING","WAITING"].includes(status)){ | |||
| $('#stop-model-debug-'+jobID).removeClass('disabled') | |||
| $('#stop-model-debug-'+jobID).addClass('blue') | |||
| $('#stop-model-debug-'+jobID).removeClass('disabled').addClass('blue') | |||
| } | |||
| if(["CREATING","STOPPING","STARTING","STOPPED","FAILED","START_FAILED"].includes(status)){ | |||
| $('#stop-model-debug-'+jobID).removeClass('blue') | |||
| $('#stop-model-debug-'+jobID).addClass('disabled') | |||
| $('#stop-model-debug-'+jobID).removeClass('blue').addClass('disabled') | |||
| } | |||
| if(status==="STOPPED" || status==="FAILED"|| status==="START_FAILED"){ | |||
| $('#model-delete-'+jobID).removeClass('disabled') | |||
| $('#model-delete-'+jobID).addClass('blue') | |||
| $('#model-delete-'+jobID).removeClass('disabled').addClass('blue') | |||
| }else{ | |||
| $('#model-delete-'+jobID).removeClass('blue') | |||
| $('#model-delete-'+jobID).addClass('disabled') | |||
| $('#model-delete-'+jobID).removeClass('blue').addClass('disabled') | |||
| } | |||
| }).fail(function(err) { | |||
| console.log(err); | |||
| @@ -554,6 +641,7 @@ | |||
| }); | |||
| }; | |||
| $(document).ready(function(){ | |||
| dropdownValue = dropdownValue==="CPU%2FGPU"? 'CPU/GPU' : dropdownValue | |||
| $('.default.text').text(dropdownValue) | |||
| $('.ui.dropdown') | |||
| .dropdown({ | |||
| @@ -564,6 +652,12 @@ | |||
| location.href = `${url}/debugjob?debugListType=${value}` | |||
| } | |||
| }) | |||
| $('.message .close') | |||
| .on('click', function() { | |||
| $(this) | |||
| .closest('.message') | |||
| .transition('fade') | |||
| }) | |||
| }) | |||
| @@ -601,7 +695,6 @@ | |||
| // 显示弹窗,弹出相应的信息 | |||
| function showmask() { | |||
| var image_tag = !$('#image_tag').val() | |||
| console.log("image_tag",image_tag) | |||
| if(image_tag){ | |||
| return | |||
| } | |||
| @@ -149,6 +149,12 @@ | |||
| {{.i18n.Tr "repo.model_manager"}} | |||
| </a> | |||
| {{end}} | |||
| {{if .Permission.CanRead $.UnitTypeModelManage}} | |||
| <a class="{{if .isModelManage}}active{{end}} item" href="{{.RepoLink}}/modelmanage/show_model"> | |||
| <svg class="svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="16" height="16"><path fill="none" d="M0 0h24v24H0z"/><path d="M3.741 1.408l18.462 10.154a.5.5 0 0 1 0 .876L3.741 22.592A.5.5 0 0 1 3 22.154V1.846a.5.5 0 0 1 .741-.438zM5 13v6.617L18.85 12 5 4.383V11h5v2H5z"/></svg> | |||
| {{.i18n.Tr "repo.model_manager"}} | |||
| </a> | |||
| {{end}} | |||
| {{if .Permission.CanRead $.UnitTypeCloudBrain}} | |||
| <a class="{{if .PageIsCloudBrain}}active{{end}} item" href="{{.RepoLink}}/debugjob?debugListType=all"> | |||
| <span> | |||