| @@ -65,6 +65,8 @@ const ( | |||
| ActionCreateImage //36 | |||
| ActionImageRecommend //37 | |||
| ActionChangeUserAvatar //38 | |||
| ActionCreateGrampusNPUDebugTask //39 | |||
| ActionCreateGrampusGPUDebugTask //40 | |||
| ) | |||
| // Action represents user operation type and other information to | |||
| @@ -375,6 +377,8 @@ func (a *Action) IsCloudbrainAction() bool { | |||
| ActionCreateInferenceTask, | |||
| ActionCreateBenchMarkTask, | |||
| ActionCreateGPUTrainTask, | |||
| ActionCreateGrampusGPUDebugTask, | |||
| ActionCreateGrampusNPUDebugTask, | |||
| ActionCreateGrampusNPUTrainTask, | |||
| ActionCreateGrampusGPUTrainTask: | |||
| return true | |||
| @@ -1442,6 +1442,20 @@ type GrampusJobInfo struct { | |||
| UserID string `json:"userId"` | |||
| Tasks []GrampusTasks `json:"tasks"` | |||
| } | |||
| type GrampusNotebookInfo struct { | |||
| StartedAt int64 `json:"startedAt"` | |||
| RunSec int64 `json:"runSec"` | |||
| CompletedAt int64 `json:"completedAt"` | |||
| CreatedAt int64 `json:"createdAt"` | |||
| UpdatedAt int64 `json:"updatedAt"` | |||
| Desc string `json:"desc"` | |||
| JobID string `json:"id"` | |||
| Name string `json:"name"` | |||
| Status string `json:"status"` | |||
| UserID string `json:"userId"` | |||
| Tasks []GrampusNotebookTask `json:"tasks"` | |||
| } | |||
| type Center struct { | |||
| ID string `json:"id"` | |||
| Name string `json:"name"` | |||
| @@ -1518,6 +1532,11 @@ type GetGrampusJobResponse struct { | |||
| JobInfo GrampusJobInfo `json:"otJob"` | |||
| } | |||
| type GrampusNotebookResponse struct { | |||
| GrampusResult | |||
| JobInfo GrampusNotebookInfo `json:"otJob"` | |||
| } | |||
| type GrampusStopJobResponse struct { | |||
| GrampusResult | |||
| StoppedAt int64 `json:"stoppedAt"` | |||
| @@ -1537,6 +1556,21 @@ type GrampusTasks struct { | |||
| Code GrampusDataset `json:"code"` | |||
| BootFile string `json:"bootFile"` | |||
| } | |||
| type GrampusNotebookTask struct { | |||
| AutoStopDuration int `json:"autoStopDuration"` | |||
| Name string `json:"name"` | |||
| Capacity int `json:"capacity"` | |||
| CenterID []string `json:"centerID"` | |||
| CenterName []string `json:"centerName"` | |||
| Code GrampusDataset `json:"code"` | |||
| Datasets []GrampusDataset `json:"datasets"` | |||
| ImageId string `json:"imageId"` | |||
| ImageUrl string `json:"imageUrl"` | |||
| ResourceSpecId string `json:"resourceSpecId"` | |||
| Token string `json:"token"` | |||
| Url string `json:"url"` | |||
| Status string `json:"status"` | |||
| } | |||
| type GrampusDataset struct { | |||
| Name string `json:"name"` | |||
| @@ -1550,6 +1584,11 @@ type CreateGrampusJobRequest struct { | |||
| Tasks []GrampusTasks `json:"tasks"` | |||
| } | |||
| type CreateGrampusNotebookRequest struct { | |||
| Name string `json:"name"` | |||
| Tasks []GrampusNotebookTask `json:"tasks"` | |||
| } | |||
| type GetTrainJobMetricStatisticResult struct { | |||
| TrainJobResult | |||
| Interval int `json:"interval"` //查询的时间间隔,单位为分钟 | |||
| @@ -36,6 +36,8 @@ func GetTaskTypeFromAction(a ActionType) TaskType { | |||
| ActionCreateInferenceTask, | |||
| ActionCreateBenchMarkTask, | |||
| ActionCreateGPUTrainTask, | |||
| ActionCreateGrampusGPUDebugTask, | |||
| ActionCreateGrampusNPUDebugTask, | |||
| ActionCreateGrampusNPUTrainTask, | |||
| ActionCreateGrampusGPUTrainTask: | |||
| return TaskCreateCloudbrainTask | |||
| @@ -29,3 +29,19 @@ type CreateGrampusTrainJobForm struct { | |||
| func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { | |||
| return validate(errs, ctx.Data, f, ctx.Locale) | |||
| } | |||
// CreateGrampusNotebookForm holds the fields posted when a Grampus notebook
// (debug) task is created. Fields tagged binding:"Required" must be present
// in the submitted form.
type CreateGrampusNotebookForm struct {
	// Type selects the processor kind of the notebook.
	// NOTE(review): the create handler in this change treats 1 as NPU and
	// anything else as GPU — confirm if more values are added.
	Type int `form:"type"`

	DisplayJobName string `form:"display_job_name" binding:"Required"`
	Attachment     string `form:"attachment"`
	ImageID        string `form:"image_id" binding:"Required"`
	Description    string `form:"description"`
	BranchName     string `form:"branch_name" binding:"Required"`
	Image          string `form:"image" binding:"Required"`
	DatasetName    string `form:"dataset_name"`
	SpecId         int64  `form:"spec_id" binding:"Required"`
}
| func (f *CreateGrampusNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { | |||
| return validate(errs, ctx.Data, f, ctx.Locale) | |||
| } | |||
| @@ -28,6 +28,7 @@ const ( | |||
| BucketRemote = "grampus" | |||
| RemoteModelPath = "/output/" + models.ModelSuffix | |||
| autoStopDurationMs = 4 * 60 * 60 * 1000 | |||
| ) | |||
| var ( | |||
| @@ -81,6 +82,25 @@ type GenerateTrainJobReq struct { | |||
| CodeName string | |||
| } | |||
| type GenerateNotebookJobReq struct { | |||
| JobName string | |||
| Command string | |||
| ImageUrl string | |||
| ImageId string | |||
| DisplayJobName string | |||
| Uuid string | |||
| Description string | |||
| CodeObsPath string | |||
| CommitID string | |||
| BranchName string | |||
| ComputeResource string | |||
| ProcessType string | |||
| DatasetNames string | |||
| DatasetInfos map[string]models.DatasetInfo | |||
| Spec *models.Specification | |||
| CodeName string | |||
| } | |||
| func getEndPoint() string { | |||
| index := strings.Index(setting.Endpoint, "//") | |||
| endpoint := setting.Endpoint[index+2:] | |||
| @@ -102,6 +122,82 @@ func getDatasetGrampus(datasetInfos map[string]models.DatasetInfo) []models.Gram | |||
| return datasetGrampus | |||
| } | |||
| func GenerateNotebookJob(ctx *context.Context, req *GenerateNotebookJobReq) (jobId string, err error) { | |||
| createTime := timeutil.TimeStampNow() | |||
| var datasetGrampus []models.GrampusDataset | |||
| var codeGrampus models.GrampusDataset | |||
| if ProcessorTypeNPU == req.ProcessType { | |||
| datasetGrampus = getDatasetGrampus(req.DatasetInfos) | |||
| codeGrampus = models.GrampusDataset{ | |||
| Name: req.CodeName, | |||
| Bucket: setting.Bucket, | |||
| EndPoint: getEndPoint(), | |||
| ObjectKey: req.CodeObsPath + cloudbrain.DefaultBranchName + ".zip", | |||
| } | |||
| } | |||
| jobResult, err := createNotebookJob(models.CreateGrampusNotebookRequest{ | |||
| Name: req.JobName, | |||
| Tasks: []models.GrampusNotebookTask{ | |||
| { | |||
| Name: req.JobName, | |||
| ResourceSpecId: req.Spec.SourceSpecId, | |||
| ImageId: req.ImageId, | |||
| ImageUrl: req.ImageUrl, | |||
| Datasets: datasetGrampus, | |||
| Code: codeGrampus, | |||
| AutoStopDuration:autoStopDurationMs, | |||
| Capacity: setting.Capacity, | |||
| }, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("createNotebookJob failed: %v", err.Error()) | |||
| return "", err | |||
| } | |||
| jobID := jobResult.JobInfo.JobID | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: TransTrainJobStatus(jobResult.JobInfo.Status), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobID, | |||
| JobName: req.JobName, | |||
| DisplayJobName: req.DisplayJobName, | |||
| JobType: string(models.JobTypeDebug), | |||
| Type: models.TypeC2Net, | |||
| Uuid: req.Uuid, | |||
| DatasetName: req.DatasetNames, | |||
| CommitID: req.CommitID, | |||
| IsLatestVersion: "1", | |||
| ComputeResource: req.ComputeResource, | |||
| ImageID: req.ImageId, | |||
| BranchName: req.BranchName, | |||
| Description: req.Description, | |||
| WorkServerNumber: 1, | |||
| EngineName: req.ImageUrl, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| Spec: req.Spec, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) | |||
| return "", err | |||
| } | |||
| var actionType models.ActionType | |||
| if req.ComputeResource == models.NPUResource { | |||
| actionType = models.ActionCreateGrampusNPUDebugTask | |||
| } else if req.ComputeResource == models.GPUResource { | |||
| actionType = models.ActionCreateGrampusGPUDebugTask | |||
| } | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType) | |||
| return jobID, nil | |||
| } | |||
| func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) { | |||
| createTime := timeutil.TimeStampNow() | |||
| @@ -26,6 +26,7 @@ const ( | |||
| urlGetResourceSpecs = urlOpenApiV1 + "resourcespec" | |||
| urlGetAiCenter = urlOpenApiV1 + "sharescreen/aicenter" | |||
| urlGetImages = urlOpenApiV1 + "image" | |||
| urlNotebookJob = urlOpenApiV1 + "notebook" | |||
| errorIllegalToken = 1005 | |||
| ) | |||
| @@ -87,6 +88,39 @@ func getToken() error { | |||
| return nil | |||
| } | |||
| func createNotebookJob(req models.CreateGrampusNotebookRequest) (*models.GrampusNotebookResponse, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GrampusNotebookResponse | |||
| retry := 0 | |||
| sendjob: | |||
| _, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetBody(req). | |||
| SetResult(&result). | |||
| Post(HOST + urlNotebookJob) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty CreateNotebookJob: %s", err) | |||
| } | |||
| if result.ErrorCode == errorIllegalToken && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if result.ErrorCode != 0 { | |||
| log.Error("CreateNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf("CreateNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func createJob(req models.CreateGrampusJobRequest) (*models.CreateGrampusJobResponse, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| @@ -120,6 +154,39 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func GetNotebookJob(jobID string)(*models.GrampusNotebookResponse, error){ | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GrampusNotebookResponse | |||
| retry := 0 | |||
| sendjob: | |||
| _, err := client.R(). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + urlNotebookJob + "/" + jobID) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetNotebookJob: %v", err) | |||
| } | |||
| if result.ErrorCode == errorIllegalToken && retry < 1 { | |||
| retry++ | |||
| log.Info("retry get token") | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if result.ErrorCode != 0 { | |||
| log.Error("GetNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) | |||
| return nil, fmt.Errorf("GetNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func GetJob(jobID string) (*models.GetGrampusJobResponse, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| @@ -4,7 +4,6 @@ import ( | |||
| "encoding/json" | |||
| "errors" | |||
| "fmt" | |||
| "path" | |||
| "strconv" | |||
| "strings" | |||
| @@ -15,20 +14,13 @@ import ( | |||
| "code.gitea.io/gitea/modules/log" | |||
| "code.gitea.io/gitea/modules/notification" | |||
| "code.gitea.io/gitea/modules/setting" | |||
| "code.gitea.io/gitea/modules/storage" | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| ) | |||
| const ( | |||
| //notebook | |||
| storageTypeOBS = "obs" | |||
| autoStopDuration = 4 * 60 * 60 | |||
| autoStopDurationMs = 4 * 60 * 60 * 1000 | |||
| MORDELART_USER_IMAGE_ENGINE_ID = -1 | |||
| DataSetMountPath = "/home/ma-user/work" | |||
| NotebookEnv = "Python3" | |||
| NotebookType = "Ascend" | |||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | |||
| //train-job | |||
| // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" | |||
| @@ -185,14 +177,6 @@ type OrgMultiNode struct { | |||
| Node []int `json:"node"` | |||
| } | |||
| // type Parameter struct { | |||
| // Label string `json:"label"` | |||
| // Value string `json:"value"` | |||
| // } | |||
| // type Parameters struct { | |||
| // Parameter []Parameter `json:"parameter"` | |||
| // } | |||
| type Parameters struct { | |||
| Parameter []struct { | |||
| @@ -201,80 +185,6 @@ type Parameters struct { | |||
| } `json:"parameter"` | |||
| } | |||
| func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error { | |||
| var dataActualPath string | |||
| if uuid != "" { | |||
| dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" | |||
| } else { | |||
| userPath := setting.UserBasePath + ctx.User.Name + "/" | |||
| isExist, err := storage.ObsHasObject(userPath) | |||
| if err != nil { | |||
| log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"]) | |||
| return err | |||
| } | |||
| if !isExist { | |||
| if err = storage.ObsCreateObject(userPath); err != nil { | |||
| log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"]) | |||
| return err | |||
| } | |||
| } | |||
| dataActualPath = setting.Bucket + "/" + userPath | |||
| } | |||
| if poolInfos == nil { | |||
| json.Unmarshal([]byte(setting.PoolInfos), &poolInfos) | |||
| } | |||
| createTime := timeutil.TimeStampNow() | |||
| jobResult, err := CreateJob(models.CreateNotebookParams{ | |||
| JobName: jobName, | |||
| Description: description, | |||
| ProfileID: setting.ProfileID, | |||
| Flavor: flavor, | |||
| Pool: models.Pool{ | |||
| ID: poolInfos.PoolInfo[0].PoolId, | |||
| Name: poolInfos.PoolInfo[0].PoolName, | |||
| Type: poolInfos.PoolInfo[0].PoolType, | |||
| }, | |||
| Spec: models.Spec{ | |||
| Storage: models.Storage{ | |||
| Type: storageTypeOBS, | |||
| Location: models.Location{ | |||
| Path: dataActualPath, | |||
| }, | |||
| }, | |||
| AutoStop: models.AutoStop{ | |||
| Enable: true, | |||
| Duration: autoStopDuration, | |||
| }, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: string(models.JobWaiting), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobResult.ID, | |||
| JobName: jobName, | |||
| JobType: string(models.JobTypeDebug), | |||
| Type: models.TypeCloudBrainTwo, | |||
| Uuid: uuid, | |||
| ComputeResource: models.NPUResource, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| }) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask) | |||
| return nil | |||
| } | |||
| func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error { | |||
| if poolInfos == nil { | |||
| @@ -88,7 +88,7 @@ func getModelArtsImages(ctx *context.APIContext) { | |||
| } | |||
| func getC2netNpuImages(ctx *context.APIContext) { | |||
| images, err := grampus.GetImages(grampus.ProcessorTypeNPU) | |||
| images, err := grampus.GetImages(grampus.ProcessorTypeNPU, string(models.JobTypeTrain)) | |||
| var npuImageInfos []NPUImageINFO | |||
| if err != nil { | |||
| log.Error("GetImages failed:", err.Error()) | |||
| @@ -47,12 +47,34 @@ const ( | |||
| tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show" | |||
| //GPU | |||
| tplGrampusNotebookGPUNew base.TplName = "repo/grampus/notebook/gpu/new" | |||
| tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new" | |||
| //NPU | |||
| tplGrampusNotebookNPUNew base.TplName = "repo/grampus/notebook/npu/new" | |||
| tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new" | |||
| ) | |||
| func GrampusNotebookNew(ctx *context.Context) { | |||
| ctx.Data["IsCreate"] = true | |||
| notebookType := ctx.QueryInt("type") | |||
| processType := grampus.ProcessorTypeGPU | |||
| if notebookType == 1 { | |||
| processType = grampus.ProcessorTypeNPU | |||
| } | |||
| err := grampusNotebookNewDataPrepare(ctx, processType) | |||
| if err != nil { | |||
| ctx.ServerError("get new notebook-job info failed", err) | |||
| return | |||
| } | |||
| if processType == grampus.ProcessorTypeGPU { | |||
| ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew) | |||
| } else { | |||
| ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew) | |||
| } | |||
| } | |||
| func GrampusTrainJobGPUNew(ctx *context.Context) { | |||
| ctx.Data["IsCreate"] = true | |||
| err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) | |||
| @@ -73,8 +95,125 @@ func GrampusTrainJobNPUNew(ctx *context.Context) { | |||
| } | |||
| ctx.HTML(200, tplGrampusTrainJobNPUNew) | |||
| } | |||
| func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) { | |||
| ctx.Data["IsCreate"] = true | |||
| displayJobName := form.DisplayJobName | |||
| jobName := util.ConvertDisplayJobNameToJobName(displayJobName) | |||
| uuid := form.Attachment | |||
| description := form.Description | |||
| repo := ctx.Repo.Repository | |||
| branchName := form.BranchName | |||
| image := strings.TrimSpace(form.Image) | |||
| tpl := tplGrampusNotebookGPUNew | |||
| processType := grampus.ProcessorTypeGPU | |||
| computeSource := models.GPUResource | |||
| computeSourceSimple := models.GPU | |||
| if form.Type == 1 { | |||
| tpl = tplGrampusNotebookNPUNew | |||
| processType = grampus.ProcessorTypeNPU | |||
| computeSource = models.NPUResource | |||
| computeSourceSimple := models.NPU | |||
| } | |||
| lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeDebug), displayJobName)) | |||
| defer lock.UnLock() | |||
| isOk, err := lock.Lock(models.CloudbrainKeyDuration) | |||
| if !isOk { | |||
| log.Error("lock processed failed:%v", err, ctx.Data["MsgID"]) | |||
| grampusNotebookNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tpl, &form) | |||
| return | |||
| } | |||
| func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error { | |||
| if !jobNamePattern.MatchString(displayJobName) { | |||
| grampusNotebookNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form) | |||
| return | |||
| } | |||
| //check count limit | |||
| count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeSource) | |||
| if err != nil { | |||
| log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||
| grampusTrainJobNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr("system error", tpl, &form) | |||
| return | |||
| } else { | |||
| if count >= 1 { | |||
| log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) | |||
| grampusTrainJobNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form) | |||
| return | |||
| } | |||
| } | |||
| //check whether the task name in the project is duplicated | |||
| tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName) | |||
| if err == nil { | |||
| if len(tasks) != 0 { | |||
| log.Error("the job name did already exist", ctx.Data["MsgID"]) | |||
| grampusTrainJobNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr("the job name did already exist", tpl, &form) | |||
| return | |||
| } | |||
| } else { | |||
| if !models.IsErrJobNotExist(err) { | |||
| log.Error("system error, %v", err, ctx.Data["MsgID"]) | |||
| grampusTrainJobNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr("system error", tpl, &form) | |||
| return | |||
| } | |||
| } | |||
| //check specification | |||
| spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeDebug, | |||
| ComputeResource: computeSourceSimple, | |||
| Cluster: models.C2NetCluster, | |||
| }) | |||
| if err != nil || spec == nil { | |||
| grampusTrainJobNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr("Resource specification not available", tpl, &form) | |||
| return | |||
| } | |||
| if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) { | |||
| log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID) | |||
| grampusTrainJobNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form) | |||
| return | |||
| } | |||
| commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) | |||
| command := "" | |||
| req := &grampus.GenerateNotebookReq{ | |||
| JobName: jobName, | |||
| DisplayJobName: displayJobName, | |||
| ComputeResource: computeSource, | |||
| ProcessType: processType, | |||
| Command: command, | |||
| ImageUrl: image, | |||
| ImageId: form.ImageID, | |||
| Description: description, | |||
| Uuid: uuid, | |||
| CommitID: commitID, | |||
| BranchName: branchName, | |||
| DatasetNames: form.DatasetName, | |||
| WorkServerNumber: 1, | |||
| Spec: spec, | |||
| } | |||
| _, err = grampus.GenerateNotebook(ctx, req) | |||
| if err != nil { | |||
| log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"]) | |||
| grampusTrainJobNewDataPrepare(ctx, processType) | |||
| ctx.RenderWithErr(err.Error(), tpl, &form) | |||
| return | |||
| } | |||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all") | |||
| } | |||
| func grampusNotebookNewDataPrepare(ctx *context.Context, processType string) error { | |||
| ctx.Data["PageIsCloudBrain"] = true | |||
| t := time.Now() | |||
| @@ -82,49 +221,67 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err | |||
| ctx.Data["display_job_name"] = displayJobName | |||
| //get valid images | |||
| images, err := grampus.GetImages(processType) | |||
| if processType == grampus.ProcessorTypeNPU { | |||
| images, err := grampus.GetImages(processType, string(models.JobTypeDebug)) | |||
| if err != nil { | |||
| log.Error("GetImages failed:", err.Error()) | |||
| } else { | |||
| ctx.Data["images"] = images.Infos | |||
| } | |||
| } | |||
| //prepare available specs | |||
| computeResourceSimple := models.GPU | |||
| datasetType := models.TypeCloudBrainOne | |||
| computeResource := models.GPUResource | |||
| if processType == grampus.ProcessorTypeNPU { | |||
| computeResourceSimple = models.NPU | |||
| datasetType = models.TypeCloudBrainTwo | |||
| computeResource = models.NPUResource | |||
| } | |||
| prepareGrampusSpecs(ctx, computeResourceSimple, models.JobTypeDebug) | |||
| //get branches | |||
| branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) | |||
| if err != nil { | |||
| log.Error("GetImages failed:", err.Error()) | |||
| log.Error("GetBranches error:", err.Error()) | |||
| } else { | |||
| ctx.Data["images"] = images.Infos | |||
| ctx.Data["branches"] = branches | |||
| } | |||
| grampus.InitSpecialPool() | |||
| ctx.Data["branchName"] = ctx.Repo.BranchName | |||
| ctx.Data["GPUEnabled"] = true | |||
| ctx.Data["NPUEnabled"] = true | |||
| includeCenters := make(map[string]struct{}) | |||
| excludeCenters := make(map[string]struct{}) | |||
| if grampus.SpecialPools != nil { | |||
| for _, pool := range grampus.SpecialPools.Pools { | |||
| if pool.IsExclusive { | |||
| if !IsUserInOrgPool(ctx.User.ID, pool) { | |||
| ctx.Data[pool.Type+"Enabled"] = false | |||
| } | |||
| } else { | |||
| if strings.Contains(strings.ToLower(processType), strings.ToLower(pool.Type)) { | |||
| if IsUserInOrgPool(ctx.User.ID, pool) { | |||
| for _, center := range pool.Pool { | |||
| includeCenters[center.Queue] = struct{}{} | |||
| } | |||
| } else { | |||
| for _, center := range pool.Pool { | |||
| excludeCenters[center.Queue] = struct{}{} | |||
| } | |||
| ctx.Data["datasetType"] = datasetType | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, computeResource, models.JobTypeDebug) | |||
| ctx.Data["WaitCount"] = waitCount | |||
| NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeResource) | |||
| ctx.Data["NotStopTaskCount"] = NotStopTaskCount | |||
| } | |||
| return nil | |||
| } | |||
| } | |||
| func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error { | |||
| ctx.Data["PageIsCloudBrain"] = true | |||
| } | |||
| t := time.Now() | |||
| var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] | |||
| ctx.Data["display_job_name"] = displayJobName | |||
| //get valid images | |||
| if processType == grampus.ProcessorTypeNPU { | |||
| images, err := grampus.GetImages(processType, string(models.JobTypeTrain)) | |||
| if err != nil { | |||
| log.Error("GetImages failed:", err.Error()) | |||
| } else { | |||
| ctx.Data["images"] = images.Infos | |||
| } | |||
| } | |||
| //prepare available specs | |||
| if processType == grampus.ProcessorTypeNPU { | |||
| prepareGrampusTrainSpecs(ctx, models.NPU) | |||
| prepareGrampusSpecs(ctx, models.NPU) | |||
| } else if processType == grampus.ProcessorTypeGPU { | |||
| prepareGrampusTrainSpecs(ctx, models.GPU) | |||
| prepareGrampusSpecs(ctx, models.GPU) | |||
| } | |||
| //get branches | |||
| @@ -203,55 +360,19 @@ func GrampusTrainJobVersionNew(ctx *context.Context) { | |||
| } | |||
| } | |||
| func prepareGrampusTrainSpecs(ctx *context.Context, computeResource string) { | |||
| func prepareGrampusSpecs(ctx *context.Context, computeResource string, jobType ...models.JobType) { | |||
| tempJobType := models.JobTypeTrain | |||
| if len(jobType) > 0 { | |||
| tempJobType = jobType[0] | |||
| } | |||
| noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeTrain, | |||
| JobType: tempJobType, | |||
| ComputeResource: computeResource, | |||
| Cluster: models.C2NetCluster, | |||
| }) | |||
| ctx.Data["Specs"] = noteBookSpecs | |||
| } | |||
| func getFilterSpecBySpecialPool(specs *models.GetGrampusResourceSpecsResult, includeCenters map[string]struct{}, excludeCenters map[string]struct{}) []models.GrampusSpec { | |||
| if len(includeCenters) == 0 && len(excludeCenters) == 0 { | |||
| return specs.Infos | |||
| } | |||
| var grampusSpecs []models.GrampusSpec | |||
| for _, info := range specs.Infos { | |||
| if isInIncludeCenters(info, includeCenters) || (len(excludeCenters) != 0 && isNotAllInExcludeCenters(info, excludeCenters)) { | |||
| grampusSpecs = append(grampusSpecs, info) | |||
| } | |||
| } | |||
| return grampusSpecs | |||
| } | |||
| func isInIncludeCenters(grampusSpec models.GrampusSpec, centers map[string]struct{}) bool { | |||
| for _, center := range grampusSpec.Centers { | |||
| if _, ok := centers[center.ID]; ok { | |||
| return true | |||
| } | |||
| } | |||
| return false | |||
| } | |||
| func isNotAllInExcludeCenters(grampusSpec models.GrampusSpec, centers map[string]struct{}) bool { | |||
| for _, center := range grampusSpec.Centers { | |||
| if _, ok := centers[center.ID]; !ok { | |||
| return true | |||
| } | |||
| } | |||
| return false | |||
| } | |||
| func IsUserInOrgPool(userId int64, pool *models.SpecialPool) bool { | |||
| org, _ := models.GetOrgByName(pool.Org) | |||
| if org != nil { | |||
| isOrgMember, _ := models.IsOrganizationMember(org.ID, userId) | |||
| return isOrgMember | |||
| } | |||
| return false | |||
| } | |||
| func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error { | |||
| if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") { | |||
| log.Error("the boot file(%s) must be a python file", form.BootFile) | |||
| @@ -1216,6 +1216,17 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| }) | |||
| }, context.RepoRef()) | |||
| m.Group("/grampus", func() { | |||
| m.Group("/notebook", func() { | |||
| m.Group("/:jobid", func() { | |||
| m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow) | |||
| m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob) | |||
| m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusTrainJobDel) | |||
| }) | |||
| m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, context.PointAccount(), repo.GrampusNotebookNew) | |||
| m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusNotebookForm{}), repo.GrampusNotebookCreate) | |||
| }) | |||
| m.Group("/train-job", func() { | |||
| m.Group("/:jobid", func() { | |||
| m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow) | |||
| @@ -1288,16 +1299,6 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Group("/modelarts", func() { | |||
| m.Group("/notebook", func() { | |||
| /* v1.0 | |||
| m.Group("/:jobid", func() { | |||
| m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) | |||
| m.Get("/debug", cloudbrain.AdminOrJobCreaterRight, repo.NotebookDebug) | |||
| m.Post("/:action", reqRepoCloudBrainWriter, repo.NotebookManage) | |||
| m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookDel) | |||
| }) | |||
| m.Get("/create", reqRepoCloudBrainWriter, repo.NotebookNew) | |||
| m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsNotebookForm{}), repo.NotebookCreate) | |||
| */ | |||
| m.Group("/:id", func() { | |||
| m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) | |||
| m.Get("/debug", cloudbrain.AdminOrJobCreaterRight, repo.NotebookDebug2) | |||
| @@ -62,6 +62,16 @@ var StatusInfoDict = map[string]StatusInfo{string(models.JobTypeDebug) + "-" + s | |||
| JobType: []models.JobType{models.JobTypeTrain}, | |||
| NotFinalStatuses: GrampusNotFinalStatuses, | |||
| ComputeResource: models.NPUResource, | |||
| }, string(models.JobTypeDebug) + "-" + strconv.Itoa(models.TypeC2Net) + "-" + models.GPUResource: { | |||
| CloudBrainTypes: []int{models.TypeC2Net}, | |||
| JobType: []models.JobType{models.JobTypeDebug}, | |||
| NotFinalStatuses: GrampusNotFinalStatuses, | |||
| ComputeResource: models.GPUResource, | |||
| }, string(models.JobTypeDebug) + "-" + strconv.Itoa(models.TypeC2Net) + "-" + models.NPUResource: { | |||
| CloudBrainTypes: []int{models.TypeC2Net}, | |||
| JobType: []models.JobType{models.JobTypeDebug}, | |||
| NotFinalStatuses: GrampusNotFinalStatuses, | |||
| ComputeResource: models.NPUResource, | |||
| }} | |||
| func GetNotFinalStatusTaskCount(uid int64, cloudbrainType int, jobType string, computeResource ...string) (int, error) { | |||