diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 8927147af..49cb0bbb1 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -89,6 +89,9 @@ type Cloudbrain struct { UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` Duration int64 TrainJobDuration string + Image string //GPU镜像名称 + GpuQueue string //GPU类型即GPU队列 + ResourceSpecId int //GPU规格id DeletedAt time.Time `xorm:"deleted"` CanDebug bool `xorm:"-"` CanDel bool `xorm:"-"` @@ -103,7 +106,7 @@ type Cloudbrain struct { IsLatestVersion string //是否是最新版本,1是,0否 CommitID string //提交的仓库代码id PreVersionName string //父版本名称 - ComputeResource string //计算资源,例如npu + ComputeResource string `xorm:"-"` //计算资源,例如npu EngineID int64 //引擎id TrainUrl string //输出模型的obs路径 diff --git a/models/user_business_analysis.go b/models/user_business_analysis.go index 9b9d5e0ad..a15b9db5f 100644 --- a/models/user_business_analysis.go +++ b/models/user_business_analysis.go @@ -1,7 +1,6 @@ package models import ( - "encoding/json" "fmt" "sort" "strconv" @@ -202,15 +201,7 @@ func QueryUserStaticDataAll(opts *UserBusinessAnalysisQueryOptions) ([]*UserBusi return nil, 0 } log.Info("query return total:" + fmt.Sprint(allCount)) - if allCount == 0 { - CommitCodeSizeMap, err := GetAllUserKPIStats() - if err != nil { - log.Info("query commit code errr.") - } else { - log.Info("query commit code size, len=" + fmt.Sprint(len(CommitCodeSizeMap))) - } - RefreshUserStaticAllTabel(make(map[string]int), CommitCodeSizeMap) - } + pageSize := 1000 totalPage := int(allCount) / pageSize userBusinessAnalysisReturnList := UserBusinessAnalysisAllList{} @@ -370,7 +361,7 @@ func RefreshUserStaticAllTabel(wikiCountMap map[string]int, CommitCodeSizeMap ma CodeMergeCountMap := queryPullRequest(start_unix, end_unix) CommitCountMap := queryCommitAction(start_unix, end_unix, 5) - IssueCountMap := queryAction(start_unix, end_unix, 6) + IssueCountMap := queryCreateIssue(start_unix, end_unix) CommentCountMap := queryComment(start_unix, end_unix) FocusRepoCountMap := queryWatch(start_unix, end_unix) @@ -395,7 +386,7 @@ func RefreshUserStaticAllTabel(wikiCountMap map[string]int, CommitCodeSizeMap ma var indexTotal int64 indexTotal = 0 for { - sess.Select("`user`.*").Table("user").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("`user`.*").Table("user").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) userList := make([]*User, 0) sess.Find(&userList) for i, userRecord := range userList { @@ -528,7 +519,7 @@ func CounDataByDateAndReCount(wikiCountMap map[string]int, startTime time.Time, DataDate := startTime.Format("2006-01-02") CodeMergeCountMap := queryPullRequest(start_unix, end_unix) CommitCountMap := queryCommitAction(start_unix, end_unix, 5) - IssueCountMap := queryAction(start_unix, end_unix, 6) + IssueCountMap := queryCreateIssue(start_unix, end_unix) CommentCountMap := queryComment(start_unix, end_unix) FocusRepoCountMap := queryWatch(start_unix, end_unix) @@ -559,7 +550,7 @@ func CounDataByDateAndReCount(wikiCountMap map[string]int, startTime time.Time, var indexTotal int64 indexTotal = 0 for { - sess.Select("`user`.*").Table("user").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("`user`.*").Table("user").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) userList := make([]*User, 0) sess.Find(&userList) @@ -709,7 +700,7 @@ func querySolveIssue(start_unix int64, end_unix int64) map[int64]int { issueAssigneesList := make([]*IssueAssignees, 0) sess.Select("issue_assignees.*").Table("issue_assignees"). Join("inner", "issue", "issue.id=issue_assignees.issue_id"). - Where(cond).Limit(Page_SIZE, int(indexTotal)) + Where(cond).OrderBy("issue_assignees.id asc").Limit(Page_SIZE, int(indexTotal)) sess.Find(&issueAssigneesList) @@ -744,7 +735,7 @@ func queryPullRequest(start_unix int64, end_unix int64) map[int64]int { indexTotal = 0 for { issueList := make([]*Issue, 0) - sess.Select("issue.*").Table("issue").Join("inner", "pull_request", "issue.id=pull_request.issue_id").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("issue.*").Table("issue").Join("inner", "pull_request", "issue.id=pull_request.issue_id").Where(cond).OrderBy("issue.id asc").Limit(Page_SIZE, int(indexTotal)) sess.Find(&issueList) log.Info("query issue(PR) size=" + fmt.Sprint(len(issueList))) for _, issueRecord := range issueList { @@ -777,7 +768,7 @@ func queryCommitAction(start_unix int64, end_unix int64, actionType int64) map[i var indexTotal int64 indexTotal = 0 for { - sess.Select("id,user_id,op_type,act_user_id").Table("action").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("id,user_id,op_type,act_user_id").Table("action").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) actionList := make([]*Action, 0) sess.Find(&actionList) @@ -799,29 +790,30 @@ func queryCommitAction(start_unix int64, end_unix int64, actionType int64) map[i return resultMap } -func queryAction(start_unix int64, end_unix int64, actionType int64) map[int64]int { +func queryCreateIssue(start_unix int64, end_unix int64) map[int64]int { + sess := x.NewSession() defer sess.Close() resultMap := make(map[int64]int) - cond := "op_type=" + fmt.Sprint(actionType) + " and created_unix>=" + fmt.Sprint(start_unix) + " and created_unix<=" + fmt.Sprint(end_unix) + cond := "is_pull=false and created_unix>=" + fmt.Sprint(start_unix) + " and created_unix<=" + fmt.Sprint(end_unix) - count, err := sess.Where(cond).Count(new(Action)) + count, err := sess.Where(cond).Count(new(Issue)) if err != nil { - log.Info("query Action error. return.") + log.Info("query Issue error. return.") return resultMap } var indexTotal int64 indexTotal = 0 for { - sess.Select("id,user_id,op_type,act_user_id").Table("action").Where(cond).Limit(Page_SIZE, int(indexTotal)) - actionList := make([]*Action, 0) - sess.Find(&actionList) - log.Info("query action size=" + fmt.Sprint(len(actionList))) - for _, actionRecord := range actionList { - if _, ok := resultMap[actionRecord.UserID]; !ok { - resultMap[actionRecord.UserID] = 1 + sess.Select("id,poster_id").Table("issue").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) + issueList := make([]*Issue, 0) + sess.Find(&issueList) + log.Info("query issue size=" + fmt.Sprint(len(issueList))) + for _, issueRecord := range issueList { + if _, ok := resultMap[issueRecord.PosterID]; !ok { + resultMap[issueRecord.PosterID] = 1 } else { - resultMap[actionRecord.UserID] += 1 + resultMap[issueRecord.PosterID] += 1 } } indexTotal += Page_SIZE @@ -830,6 +822,7 @@ func queryAction(start_unix int64, end_unix int64, actionType int64) map[int64]i } } return resultMap + } func queryComment(start_unix int64, end_unix int64) map[int64]int { @@ -846,7 +839,7 @@ func queryComment(start_unix int64, end_unix int64) map[int64]int { var indexTotal int64 indexTotal = 0 for { - sess.Select("id,type,poster_id").Table("comment").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("id,type,poster_id").Table("comment").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) commentList := make([]*Comment, 0) sess.Find(&commentList) log.Info("query Comment size=" + fmt.Sprint(len(commentList))) @@ -882,7 +875,7 @@ func queryWatch(start_unix int64, end_unix int64) map[int64]int { indexTotal = 0 for { watchList := make([]*Watch, 0) - sess.Select("id,user_id,repo_id").Table("watch").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("id,user_id,repo_id").Table("watch").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) sess.Find(&watchList) log.Info("query Watch size=" + fmt.Sprint(len(watchList))) @@ -920,7 +913,7 @@ func queryStar(start_unix int64, end_unix int64) map[int64]int { var indexTotal int64 indexTotal = 0 for { - sess.Select("id,uid,repo_id").Table("star").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("id,uid,repo_id").Table("star").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) starList := make([]*Star, 0) sess.Find(&starList) @@ -956,7 +949,7 @@ func queryFollow(start_unix int64, end_unix int64) map[int64]int { var indexTotal int64 indexTotal = 0 for { - sess.Select("id,user_id,follow_id").Table("follow").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("id,user_id,follow_id").Table("follow").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) followList := make([]*Follow, 0) sess.Find(&followList) @@ -992,7 +985,7 @@ func queryDatasetSize(start_unix int64, end_unix int64) map[int64]int { var indexTotal int64 indexTotal = 0 for { - sess.Select("id,uploader_id,size").Table("attachment").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("id,uploader_id,size").Table("attachment").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) attachmentList := make([]*Attachment, 0) sess.Find(&attachmentList) @@ -1028,7 +1021,7 @@ func queryUserCreateRepo(start_unix int64, end_unix int64) map[int64]int { var indexTotal int64 indexTotal = 0 for { - sess.Select("id,owner_id,name").Table("repository").Where(cond).Limit(Page_SIZE, int(indexTotal)) + sess.Select("id,owner_id,name").Table("repository").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) repoList := make([]*Repository, 0) sess.Find(&repoList) log.Info("query Repository size=" + fmt.Sprint(len(repoList))) @@ -1099,8 +1092,7 @@ func queryUserRepoOpenIIndex(start_unix int64, end_unix int64) map[int64]float64 } } - userMapJson, _ := json.Marshal(userMap) - log.Info("userMapJson=" + string(userMapJson)) + log.Info("user openi index size=" + fmt.Sprint(len(userMap))) return userMap } @@ -1119,7 +1111,7 @@ func queryLoginCount(start_unix int64, end_unix int64) map[int64]int { var indexTotal int64 indexTotal = 0 for { - statictisSess.Select("id,u_id").Table("user_login_log").Where(cond).Limit(Page_SIZE, int(indexTotal)) + statictisSess.Select("id,u_id").Table("user_login_log").Where(cond).OrderBy("id asc").Limit(Page_SIZE, int(indexTotal)) userLoginLogList := make([]*UserLoginLog, 0) statictisSess.Find(&userLoginLogList) log.Info("query user login size=" + fmt.Sprint(len(userLoginLogList))) @@ -1135,7 +1127,7 @@ func queryLoginCount(start_unix int64, end_unix int64) map[int64]int { break } } - + log.Info("user login size=" + fmt.Sprint(len(resultMap))) return resultMap } diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 0f1c700d2..aa5d8467c 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -1,6 +1,8 @@ package cloudbrain import ( + "code.gitea.io/gitea/modules/storage" + "encoding/json" "errors" "strconv" @@ -107,6 +109,9 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath, uuid var resourceSpec *models.ResourceSpec + if ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) + } for _, spec := range ResourceSpecs.ResourceSpec { if resourceSpecId == spec.Id { resourceSpec = spec @@ -185,28 +190,142 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath, }, }) if err != nil { - log.Error("CreateJob failed:", err.Error()) + log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) return err } if jobResult.Code != Success { - log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg) + log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"]) return errors.New(jobResult.Msg) } var jobID = jobResult.Payload["jobId"].(string) err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: string(models.JobWaiting), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobID, - JobName: jobName, - SubTaskName: SubTaskName, - JobType: jobType, - Type: models.TypeCloudBrainOne, - Uuid: uuid, + Status: string(models.JobWaiting), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobID, + JobName: jobName, + SubTaskName: SubTaskName, + JobType: jobType, + Type: models.TypeCloudBrainOne, + Uuid: uuid, + Image: image, + GpuQueue: gpuQueue, + ResourceSpecId: resourceSpecId, + }) + + if err != nil { + return err + } + + return nil +} + +func RestartTask(ctx *context.Context, task *models.Cloudbrain) error { + dataActualPath := setting.Attachment.Minio.RealPath + + setting.Attachment.Minio.Bucket + "/" + + setting.Attachment.Minio.BasePath + + models.AttachmentRelativePath(task.Uuid) + + task.Uuid + jobName := task.JobName + + var resourceSpec *models.ResourceSpec + if ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) + } + for _, spec := range ResourceSpecs.ResourceSpec { + if task.ResourceSpecId == spec.Id { + resourceSpec = spec + } + } + + if resourceSpec == nil { + log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) + return errors.New("no such resourceSpec") + } + + jobResult, err := CreateJob(jobName, models.CreateJobParams{ + JobName: jobName, + RetryCount: 1, + GpuType: task.GpuQueue, + Image: task.Image, + TaskRoles: []models.TaskRole{ + { + Name: SubTaskName, + TaskNumber: 1, + MinSucceededTaskCount: 1, + MinFailedTaskCount: 1, + CPUNumber: resourceSpec.CpuNum, + GPUNumber: resourceSpec.GpuNum, + MemoryMB: resourceSpec.MemMiB, + ShmMB: resourceSpec.ShareMemMiB, + Command: Command, + NeedIBDevice: false, + IsMainRole: false, + UseNNI: false, + }, + }, + Volumes: []models.Volume{ + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, CodeMountPath + "/"), + MountPath: CodeMountPath, + ReadOnly: false, + }, + }, + { + HostPath: models.StHostPath{ + Path: dataActualPath, + MountPath: DataSetMountPath, + ReadOnly: true, + }, + }, + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, ModelMountPath + "/"), + MountPath: ModelMountPath, + ReadOnly: false, + }, + }, + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, BenchMarkMountPath + "/"), + MountPath: BenchMarkMountPath, + ReadOnly: true, + }, + }, + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath + "/"), + MountPath: Snn4imagenetMountPath, + ReadOnly: true, + }, + }, + { + HostPath: models.StHostPath{ + Path: storage.GetMinioPath(jobName, BrainScoreMountPath + "/"), + MountPath: BrainScoreMountPath, + ReadOnly: true, + }, + }, + }, }) + if err != nil { + log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) + return err + } + if jobResult.Code != Success { + log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"]) + return errors.New(jobResult.Msg) + } + + var jobID = jobResult.Payload["jobId"].(string) + task.JobID = jobID + task.Status = string(models.JobWaiting) + err = models.UpdateJob(task) if err != nil { + log.Error("UpdateJob(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"]) return err } diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 1225ea82b..351ebbd12 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -306,7 +306,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error DatasetName: attach.Name, CommitID: req.CommitID, IsLatestVersion: req.IsLatestVersion, - ComputeResource: NPUResource, + //ComputeResource: NPUResource, EngineID: req.EngineID, TrainUrl: req.TrainUrl, BranchName: req.BranchName, diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go index 2cc9e34be..07f26ceb7 100755 --- a/modules/modelarts/resty.go +++ b/modules/modelarts/resty.go @@ -174,7 +174,7 @@ sendjob: return &result, nil } -func StopJob(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { +func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { checkSetting() client := getRestyClient() var result models.NotebookActionResult @@ -207,8 +207,8 @@ sendjob: } if len(response.ErrorCode) != 0 { - log.Error("StopJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - return &result, fmt.Errorf("StopJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) } return &result, nil diff --git a/modules/storage/minio.go b/modules/storage/minio.go index 664e58d1b..8e85d0eae 100755 --- a/modules/storage/minio.go +++ b/modules/storage/minio.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "code.gitea.io/gitea/modules/setting" "github.com/minio/minio-go" ) @@ -128,3 +129,7 @@ func (m *MinioStorage) UploadObject(fileName, filePath string) error { _, err := m.client.FPutObject(m.bucket, fileName, filePath, minio.PutObjectOptions{}) return err } + +func GetMinioPath(jobName, suffixPath string) string { + return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath +} diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index f5a52f2cb..7c821824d 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -787,6 +787,7 @@ model_noright=无权限操作 model_rename=模型名称重复,请修改模型名称 debug=调试 +debug_again=再次调试 stop=停止 delete=删除 model_download=模型下载 diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 3f5fce013..f4a0bde49 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -206,7 +206,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { } repo := ctx.Repo.Repository downloadCode(repo, codePath) - uploadCodeToMinio(codePath+"/", jobName, "/code/") + uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/") modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" mkModelPath(modelPath) @@ -236,9 +236,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { uploadCodeToMinio(brainScorePath+"/", jobName, cloudbrain.BrainScoreMountPath+"/") } - err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"), - getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), - getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId) + err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"), + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), + storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), + storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId) if err != nil { cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form) @@ -247,6 +248,72 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob") } +func CloudBrainRestart(ctx *context.Context) { + var jobID = ctx.Params(":jobid") + var resultCode = "0" + var errorMsg = "" + var status = "" + + for { + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error(), ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + + if task.Status != string(models.JobStopped) && task.Status != string(models.JobSucceeded) && task.Status != string(models.JobFailed) { + log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "the job is not stopped" + break + } + + if task.Image == "" || task.GpuQueue == "" || task.Type != models.TypeCloudBrainOne { + log.Error("the job(%s) version is too old", task.JobName, ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "the job's version is too old and can not be restarted" + break + } + + count, err := models.GetCloudbrainCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "the user already has running or waiting task" + break + } + } + + err = cloudbrain.RestartTask(ctx, task) + if err != nil { + log.Error("RestartTask failed:%v", err.Error(), ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + + status = task.Status + jobID = task.JobID + + break + } + + ctx.JSON(200, map[string]string{ + "result_code": resultCode, + "error_msg": errorMsg, + "status": status, + "job_id": jobID, + }) +} + func CloudBrainShow(ctx *context.Context) { ctx.Data["PageIsCloudBrain"] = true @@ -351,32 +418,53 @@ func CloudBrainCommitImage(ctx *context.Context, form auth.CommitImageCloudBrain func CloudBrainStop(ctx *context.Context) { var jobID = ctx.Params(":jobid") - task, err := models.GetCloudbrainByJobID(jobID) - if err != nil { - ctx.ServerError("GetCloudbrainByJobID failed", err) - return - } + var resultCode = "0" + var errorMsg = "" + var status = "" - if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) { - log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) - ctx.ServerError("the job has been stopped", errors.New("the job has been stopped")) - return - } + for { + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } - err = cloudbrain.StopJob(jobID) - if err != nil { - log.Error("StopJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["msgID"]) - ctx.ServerError("StopJob failed", err) - return - } + if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) { + log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } - task.Status = string(models.JobStopped) - err = models.UpdateJob(task) - if err != nil { - ctx.ServerError("UpdateJob failed", err) - return + err = cloudbrain.StopJob(jobID) + if err != nil { + log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + + task.Status = string(models.JobStopped) + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + + status = task.Status + break } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob") + + ctx.JSON(200, map[string]string{ + "result_code": resultCode, + "error_msg": errorMsg, + "status": status, + "job_id": jobID, + }) } func StopJobsByUserID(userID int64) { @@ -423,7 +511,7 @@ func StopJobs(cloudBrains []*models.Cloudbrain) { Action: models.ActionStop, } err := retry(3, time.Second*30, func() error { - _, err := modelarts.StopJob(taskInfo.JobID, param) + _, err := modelarts.ManageNotebook(taskInfo.JobID, param) return err }) logErrorAndUpdateJobStatus(err, taskInfo) @@ -560,7 +648,7 @@ func getImages(ctx *context.Context, imageType string) { func GetModelDirs(jobName string, parentDir string) (string, error) { var req string - modelActualPath := getMinioPath(jobName, cloudbrain.ModelMountPath+"/") + modelActualPath := storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/") if parentDir == "" { req = "baseDir=" + modelActualPath } else { @@ -570,10 +658,6 @@ func GetModelDirs(jobName string, parentDir string) (string, error) { return getDirs(req) } -func getMinioPath(jobName, suffixPath string) string { - return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath -} - func CloudBrainDownloadModel(ctx *context.Context) { parentDir := ctx.Query("parentDir") fileName := ctx.Query("fileName") diff --git a/routers/repo/milestone.go b/routers/repo/milestone.go index e30e6371f..41f1f88bc 100644 --- a/routers/repo/milestone.go +++ b/routers/repo/milestone.go @@ -268,6 +268,7 @@ func MilestoneIssuesAndPulls(ctx *context.Context) { ctx.Data["CanWriteIssues"] = ctx.Repo.CanWriteIssuesOrPulls(false) ctx.Data["CanWritePulls"] = ctx.Repo.CanWriteIssuesOrPulls(true) + ctx.Data["PageIsIssueList"] = true ctx.HTML(200, tplMilestoneIssues) } diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 0ae433654..21dcdc593 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -47,6 +47,7 @@ const ( func DebugJobIndex(ctx *context.Context) { debugListType := ctx.Query("debugListType") + ctx.Data["ListType"] = debugListType MustEnableCloudbrain(ctx) repo := ctx.Repo.Repository page := ctx.QueryInt("page") @@ -78,21 +79,19 @@ func DebugJobIndex(ctx *context.Context) { } for i, task := range ciTasks { + ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx) + ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) + if task.Cloudbrain.Type == models.TypeCloudBrainOne { - ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx) - ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) ciTasks[i].Cloudbrain.ComputeResource = modelarts.GPUResource - } - if task.Cloudbrain.Type == models.TypeCloudBrainTwo { - ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx) - ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) + } else if task.Cloudbrain.Type == models.TypeCloudBrainTwo { ciTasks[i].Cloudbrain.ComputeResource = modelarts.NPUResource } - } pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) - pager.SetDefaultParams(ctx) + //pager.SetDefaultParams(ctx) + pager.AddParam(ctx, "debugListType", "ListType") ctx.Data["Page"] = pager ctx.Data["PageIsCloudBrain"] = true ctx.Data["Tasks"] = ciTasks @@ -237,38 +236,91 @@ func NotebookDebug(ctx *context.Context) { ctx.Redirect(debugUrl) } -func NotebookStop(ctx *context.Context) { +func NotebookManage(ctx *context.Context) { var jobID = ctx.Params(":jobid") - log.Info(jobID) - task, err := models.GetCloudbrainByJobID(jobID) - if err != nil { - ctx.ServerError("GetCloudbrainByJobID failed", err) - return - } + var action = ctx.Params(":action") + var resultCode = "0" + var errorMsg = "" + var status = "" - if task.Status != string(models.JobRunning) { - log.Error("the job(%s) is not running", task.JobName) - ctx.ServerError("the job is not running", errors.New("the job is not running")) - return - } + for { + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID failed:%v", err, ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } - param := models.NotebookAction{ - Action: models.ActionStop, - } - res, err := modelarts.StopJob(jobID, param) - if err != nil { - log.Error("StopJob(%s) failed:%v", task.JobName, err.Error()) - ctx.ServerError("StopJob failed", err) - return - } + if action == models.ActionStop { + if task.Status != string(models.ModelArtsRunning) { + log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "the job is not running" + break + } + } else if action == models.ActionRestart { + if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) { + log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "the job is not stopped" + break + } - task.Status = res.CurrentStatus - err = models.UpdateJob(task) - if err != nil { - ctx.ServerError("UpdateJob failed", err) - return + count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "you have already a running or waiting task, can not create more" + break + } + } + + action = models.ActionStart + } else { + log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "非法操作" + break + } + + param := models.NotebookAction{ + Action: action, + } + res, err := modelarts.ManageNotebook(jobID, param) + if err != nil { + log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "启动失败" + break + } + + task.Status = res.CurrentStatus + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + + status = task.Status + + break } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob") + + ctx.JSON(200, map[string]string{ + "result_code": resultCode, + "error_msg": errorMsg, + "status": status, + "job_id": jobID, + }) } func NotebookDel(ctx *context.Context) { @@ -279,7 +331,7 @@ func NotebookDel(ctx *context.Context) { return } - if task.Status != string(models.JobStopped) { + if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped){ log.Error("the job(%s) has not been stopped", task.JobName) ctx.ServerError("the job has not been stopped", errors.New("the job has not been stopped")) return @@ -328,6 +380,7 @@ func TrainJobIndex(ctx *context.Context) { for i, task := range tasks { tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) + tasks[i].ComputeResource = modelarts.NPUResource } pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) diff --git a/routers/routes/routes.go b/routers/routes/routes.go index d838ebaad..55ee2b750 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -968,6 +968,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Post("/commit_image", cloudbrain.AdminOrOwnerOrJobCreaterRight, bindIgnErr(auth.CommitImageCloudBrainForm{}), repo.CloudBrainCommitImage) m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainDel) + m.Post("/restart", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainRestart) m.Get("/rate", reqRepoCloudBrainReader, repo.GetRate) m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainDownloadModel) @@ -1003,7 +1004,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Group("/:jobid", func() { m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) m.Get("/debug", reqRepoCloudBrainWriter, repo.NotebookDebug) - m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookStop) + m.Post("/:action", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookManage) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookDel) }) m.Get("/create", reqRepoCloudBrainWriter, repo.NotebookNew) diff --git a/templates/repo/debugjob/index.tmpl b/templates/repo/debugjob/index.tmpl old mode 100644 new mode 100755 index 585b8dce8..19c2e6a94 --- a/templates/repo/debugjob/index.tmpl +++ b/templates/repo/debugjob/index.tmpl @@ -202,14 +202,20 @@
- - +