| @@ -24,6 +24,7 @@ type ModelArtsJobStatus string | |||
// Platform identifiers used as the Type of a Cloudbrain record.
const (
	// TypeCloudBrainOne is the first platform in the enumeration (0).
	TypeCloudBrainOne int = iota
	// TypeCloudBrainTwo is the second platform in the enumeration (1).
	TypeCloudBrainTwo
	// TypeCloudBrainGrampus marks jobs that were submitted to Grampus (2).
	TypeCloudBrainGrampus

	// TypeCloudBrainAll is deliberately outside the iota sequence.
	// NOTE(review): presumably means "do not filter by platform" — confirm
	// against the callers that pass it.
	TypeCloudBrainAll = -1
)
| @@ -98,6 +99,14 @@ const ( | |||
| ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败 | |||
| DURATION_STR_ZERO = "00:00:00" | |||
| //grampus | |||
| GrampusStatusPending = "pending" | |||
| GrampusStatusRunning = "running" | |||
| GrampusStatusFailed = "failed" | |||
| GrampusStatusSucceeded = "succeeded" | |||
| GrampusStatusStopped = "stopped" | |||
| GrampusStatusUnknown = "unknown" | |||
| ) | |||
| type Cloudbrain struct { | |||
| @@ -328,6 +337,7 @@ type CloudbrainsOptions struct { | |||
| JobTypeNot bool | |||
| NeedRepoInfo bool | |||
| RepoIDList []int64 | |||
| ComputeResource string | |||
| } | |||
| type TaskPod struct { | |||
| @@ -1150,6 +1160,44 @@ type LogFile struct { | |||
| Name string | |||
| } | |||
| //Grampus | |||
| type GrampusResult struct { | |||
| ErrorCode int `json:"errorCode"` | |||
| ErrorMsg string `json:"errorMsg"` | |||
| } | |||
| type GrampusJobInfo struct { | |||
| StartedAt int64 `json:"startedAt"` | |||
| RunSec int64 `json:"runSec"` | |||
| CompletedAt int64 `json:"completedAt"` | |||
| CreatedAt int64 `json:"createdAt"` | |||
| UpdatedAt int64 `json:"updatedAt"` | |||
| Desc string `json:"desc"` | |||
| JobID string `json:"id"` | |||
| Name string `json:"name"` | |||
| Status string `json:"status"` | |||
| UserID string `json:"userId"` | |||
| Tasks []GrampusTasks `json:"tasks"` | |||
| } | |||
| type CreateGrampusJobResponse struct { | |||
| GrampusResult | |||
| JobInfo GrampusJobInfo `json:"otJob"` | |||
| } | |||
| type GrampusTasks struct { | |||
| Command string `json:"command"` | |||
| Name string `json:"name"` | |||
| ImageId string `json:"imageId"` | |||
| ResourceSpecId string `json:"resourceSpecId"` | |||
| ImageUrl string `json:"imageUrl"` | |||
| } | |||
| type CreateGrampusJobRequest struct { | |||
| Name string `json:"name"` | |||
| Tasks []GrampusTasks `json:"tasks"` | |||
| } | |||
| func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | |||
| sess := x.NewSession() | |||
| defer sess.Close() | |||
| @@ -1179,6 +1227,12 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | |||
| ) | |||
| } | |||
| if len(opts.ComputeResource) >= 0 { | |||
| cond = cond.And( | |||
| builder.Eq{"cloudbrain.compute_resource": opts.ComputeResource}, | |||
| ) | |||
| } | |||
| if len(opts.JobTypes) > 0 { | |||
| if opts.JobTypeNot { | |||
| cond = cond.And( | |||
| @@ -1589,6 +1643,11 @@ func GetCloudbrainInferenceJobCountByUserID(userID int64) (int, error) { | |||
| return int(count), err | |||
| } | |||
| func GetGrampusCountByUserID(userID int64, jobType, computeResource string) (int, error) { | |||
| count, err := x.In("status", GrampusStatusPending, GrampusStatusRunning).And("job_type = ? and user_id = ? and type = ?", jobType, userID, TypeCloudBrainGrampus).And("compute_resource = ?", computeResource).Count(new(Cloudbrain)) | |||
| return int(count), err | |||
| } | |||
| func UpdateInferenceJob(job *Cloudbrain) error { | |||
| return updateInferenceJob(x, job) | |||
| } | |||
| @@ -0,0 +1,23 @@ | |||
| package auth | |||
| import ( | |||
| "gitea.com/macaron/binding" | |||
| "gitea.com/macaron/macaron" | |||
| ) | |||
| type CreateGrampusTrainJobForm struct { | |||
| DisplayJobName string `form:"display_job_name" binding:"Required"` | |||
| JobName string `form:"job_name" binding:"Required"` | |||
| Attachment string `form:"attachment" binding:"Required"` | |||
| BootFile string `form:"boot_file" binding:"Required"` | |||
| Flavor string `form:"flavor" binding:"Required"` | |||
| Params string `form:"run_para_list" binding:"Required"` | |||
| Description string `form:"description"` | |||
| BranchName string `form:"branch_name" binding:"Required"` | |||
| FlavorName string `form:"flaver_names" binding:"Required"` | |||
| EngineName string `form:"engine_names" binding:"Required"` | |||
| } | |||
| func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { | |||
| return validate(errs, ctx.Data, f, ctx.Locale) | |||
| } | |||
| @@ -48,8 +48,6 @@ func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, er | |||
| if !ctx.IsSigned { | |||
| return false | |||
| } | |||
| log.Info("is repo owner:" + strconv.FormatBool(ctx.IsUserRepoOwner())) | |||
| log.Info("is user admin:" + strconv.FormatBool(ctx.IsUserSiteAdmin())) | |||
| if err != nil { | |||
| return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() | |||
| @@ -1,13 +1,11 @@ | |||
| package grampus | |||
| import ( | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| "strconv" | |||
| "code.gitea.io/gitea/models" | |||
| "code.gitea.io/gitea/modules/context" | |||
| "code.gitea.io/gitea/modules/log" | |||
| "code.gitea.io/gitea/modules/notification" | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| ) | |||
| const ( | |||
| @@ -21,19 +19,6 @@ const ( | |||
| NotebookType = "Ascend" | |||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | |||
| //train-job | |||
| // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" | |||
| // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" | |||
| // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + | |||
| // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + | |||
| // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + | |||
| // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + | |||
| // "]}" | |||
| // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + | |||
| // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + | |||
| // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | |||
| // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | |||
| // "]}" | |||
| CodePath = "/code/" | |||
| OutputPath = "/output/" | |||
| ResultPath = "/result/" | |||
| @@ -65,7 +50,12 @@ var ( | |||
| ) | |||
| type GenerateTrainJobReq struct { | |||
| JobName string | |||
| JobName string | |||
| Command string | |||
| ResourceSpecId string | |||
| ImageUrl string | |||
| ImageId string | |||
| DisplayJobName string | |||
| Uuid string | |||
| Description string | |||
| @@ -74,15 +64,10 @@ type GenerateTrainJobReq struct { | |||
| BootFileUrl string | |||
| DataUrl string | |||
| TrainUrl string | |||
| FlavorCode string | |||
| LogUrl string | |||
| PoolID string | |||
| WorkServerNumber int | |||
| EngineID int64 | |||
| Parameters []models.Parameter | |||
| CommitID string | |||
| IsLatestVersion string | |||
| Params string | |||
| BranchName string | |||
| PreVersionId int64 | |||
| PreVersionName string | |||
| @@ -90,139 +75,54 @@ type GenerateTrainJobReq struct { | |||
| VersionCount int | |||
| EngineName string | |||
| TotalVersionCount int | |||
| } | |||
| type GenerateInferenceJobReq struct { | |||
| JobName string | |||
| DisplayJobName string | |||
| Uuid string | |||
| Description string | |||
| CodeObsPath string | |||
| BootFile string | |||
| BootFileUrl string | |||
| DataUrl string | |||
| TrainUrl string | |||
| FlavorCode string | |||
| LogUrl string | |||
| PoolID string | |||
| WorkServerNumber int | |||
| EngineID int64 | |||
| Parameters []models.Parameter | |||
| CommitID string | |||
| Params string | |||
| BranchName string | |||
| FlavorName string | |||
| EngineName string | |||
| LabelName string | |||
| IsLatestVersion string | |||
| VersionCount int | |||
| TotalVersionCount int | |||
| ModelName string | |||
| ModelVersion string | |||
| CkptName string | |||
| ResultUrl string | |||
| } | |||
| type VersionInfo struct { | |||
| Version []struct { | |||
| ID int `json:"id"` | |||
| Value string `json:"value"` | |||
| } `json:"version"` | |||
| } | |||
| type Flavor struct { | |||
| Info []struct { | |||
| Code string `json:"code"` | |||
| Value string `json:"value"` | |||
| } `json:"flavor"` | |||
| } | |||
| type Engine struct { | |||
| Info []struct { | |||
| ID int `json:"id"` | |||
| Value string `json:"value"` | |||
| } `json:"engine"` | |||
| } | |||
| type ResourcePool struct { | |||
| Info []struct { | |||
| ID string `json:"id"` | |||
| Value string `json:"value"` | |||
| } `json:"resource_pool"` | |||
| } | |||
| // type Parameter struct { | |||
| // Label string `json:"label"` | |||
| // Value string `json:"value"` | |||
| // } | |||
| // type Parameters struct { | |||
| // Parameter []Parameter `json:"parameter"` | |||
| // } | |||
| type Parameters struct { | |||
| Parameter []struct { | |||
| Label string `json:"label"` | |||
| Value string `json:"value"` | |||
| } `json:"parameter"` | |||
| ComputeResource string | |||
| DatasetName string | |||
| } | |||
| func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { | |||
| createTime := timeutil.TimeStampNow() | |||
| jobResult, err := createTrainJob(models.CreateTrainJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| Config: models.Config{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| DataUrl: req.DataUrl, | |||
| EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| jobResult, err := createJob(models.CreateGrampusJobRequest{ | |||
| Name: req.JobName, | |||
| Tasks: []models.GrampusTasks{ | |||
| { | |||
| Name: req.JobName, | |||
| Command: req.Command, | |||
| ResourceSpecId: req.ResourceSpecId, | |||
| ImageId: req.ImageId, | |||
| ImageUrl: req.ImageUrl, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateJob failed: %v", err.Error()) | |||
| log.Error("createJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| attach, err := models.GetAttachmentByUUID(req.Uuid) | |||
| if err != nil { | |||
| log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) | |||
| return err | |||
| } | |||
| jobId := strconv.FormatInt(jobResult.JobID, 10) | |||
| jobID := jobResult.JobInfo.JobID | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: TransTrainJobStatus(jobResult.Status), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobId, | |||
| JobName: req.JobName, | |||
| DisplayJobName: req.DisplayJobName, | |||
| JobType: string(models.JobTypeTrain), | |||
| Type: models.TypeCloudBrainTwo, | |||
| VersionID: jobResult.VersionID, | |||
| VersionName: jobResult.VersionName, | |||
| Uuid: req.Uuid, | |||
| DatasetName: attach.Name, | |||
| CommitID: req.CommitID, | |||
| IsLatestVersion: req.IsLatestVersion, | |||
| ComputeResource: models.NPUResource, | |||
| EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| BranchName: req.BranchName, | |||
| Parameters: req.Params, | |||
| BootFile: req.BootFile, | |||
| DataUrl: req.DataUrl, | |||
| LogUrl: req.LogUrl, | |||
| FlavorCode: req.FlavorCode, | |||
| Status: string(models.GrampusStatusPending), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobID, | |||
| JobName: req.JobName, | |||
| DisplayJobName: req.DisplayJobName, | |||
| JobType: string(models.JobTypeTrain), | |||
| Type: models.TypeCloudBrainGrampus, | |||
| //VersionID: jobResult.VersionID, | |||
| //VersionName: jobResult.VersionName, | |||
| Uuid: req.Uuid, | |||
| DatasetName: req.DatasetName, | |||
| CommitID: req.CommitID, | |||
| //IsLatestVersion: req.IsLatestVersion, | |||
| ComputeResource: req.ComputeResource, | |||
| //EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| BranchName: req.BranchName, | |||
| //Parameters: req.Params, | |||
| BootFile: req.BootFile, | |||
| DataUrl: req.DataUrl, | |||
| //LogUrl: req.LogUrl, | |||
| //FlavorCode: req.FlavorCode, | |||
| Description: req.Description, | |||
| WorkServerNumber: req.WorkServerNumber, | |||
| FlavorName: req.FlavorName, | |||
| @@ -237,58 +137,14 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) | |||
| return err | |||
| } | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask) | |||
| return nil | |||
| } | |||
| func TransTrainJobStatus(status int) string { | |||
| switch status { | |||
| case 0: | |||
| return "UNKNOWN" | |||
| case 1: | |||
| return "INIT" | |||
| case 2: | |||
| return "IMAGE_CREATING" | |||
| case 3: | |||
| return "IMAGE_FAILED" | |||
| case 4: | |||
| return "SUBMIT_TRYING" | |||
| case 5: | |||
| return "SUBMIT_FAILED" | |||
| case 6: | |||
| return "DELETE_FAILED" | |||
| case 7: | |||
| return "WAITING" | |||
| case 8: | |||
| return "RUNNING" | |||
| case 9: | |||
| return "KILLING" | |||
| case 10: | |||
| return "COMPLETED" | |||
| case 11: | |||
| return "FAILED" | |||
| case 12: | |||
| return "KILLED" | |||
| case 13: | |||
| return "CANCELED" | |||
| case 14: | |||
| return "LOST" | |||
| case 15: | |||
| return "SCALING" | |||
| case 16: | |||
| return "SUBMIT_MODEL_FAILED" | |||
| case 17: | |||
| return "DEPLOY_SERVICE_FAILED" | |||
| case 18: | |||
| return "CHECK_INIT" | |||
| case 19: | |||
| return "CHECK_RUNNING" | |||
| case 20: | |||
| return "CHECK_RUNNING_COMPLETED" | |||
| case 21: | |||
| return "CHECK_FAILED" | |||
| default: | |||
| return strconv.Itoa(status) | |||
| var actionType models.ActionType | |||
| if req.ComputeResource == models.NPUResource { | |||
| actionType = models.ActionCreateTrainTask | |||
| } else if req.ComputeResource == models.GPUResource { | |||
| actionType = models.ActionCreateGPUTrainTask | |||
| } | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType) | |||
| return nil | |||
| } | |||
| @@ -23,19 +23,18 @@ const ( | |||
| urlOpenApiV1 = "/openapi/v1/" | |||
| urlGetToken = urlOpenApiV1 + "token" | |||
| urlNotebook = "/demanager/instances" | |||
| urlTrainJob = "/training-jobs" | |||
| urlTrainJob = urlOpenApiV1 + "trainjob" | |||
| urlResourceSpecs = "/job/resource-specs" | |||
| urlTrainJobConfig = "/training-job-configs" | |||
| errorCodeExceedLimit = "ModelArts.0118" | |||
| urlNotebook2 = "" | |||
| modelartsIllegalToken = "" | |||
| errorIllegalToken = 1005 | |||
| ) | |||
| type GetTokenParams struct { | |||
| UserName string `json:"user_name"` | |||
| UserName string `json:"username"` | |||
| Password string `json:"password"` | |||
| } | |||
| @@ -92,44 +91,34 @@ func getToken() error { | |||
| return nil | |||
| } | |||
| func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { | |||
| func createJob(req models.CreateGrampusJobRequest) (*models.CreateGrampusJobResponse, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.CreateNotebookResult | |||
| var result models.CreateGrampusJobResponse | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| _, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetBody(createJobParams). | |||
| SetBody(req). | |||
| SetResult(&result). | |||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) | |||
| Post(HOST + urlTrainJob) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty create notebook: %s", err) | |||
| return nil, fmt.Errorf("resty CreateJob: %s", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| if result.ErrorCode == errorIllegalToken && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| var response models.NotebookResult | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| if response.ErrorCode == errorCodeExceedLimit { | |||
| response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" | |||
| } | |||
| return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| if result.ErrorCode != 0 { | |||
| log.Error("CreateJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf("CreateJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| @@ -147,7 +136,7 @@ sendjob: | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetJob: %v", err) | |||
| @@ -174,217 +163,6 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetNotebook2Result | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetJob: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| var response models.NotebookResult | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.NotebookActionResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetBody(param). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action") | |||
| if err != nil { | |||
| return &result, fmt.Errorf("resty StopJob: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| var response models.NotebookResult | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.NotebookActionResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs)) | |||
| if err != nil { | |||
| return &result, fmt.Errorf("resty ManageNotebook2: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| var response models.NotebookResult | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func DelNotebook(jobID string) (*models.NotebookDelResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.NotebookDelResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) | |||
| if err != nil { | |||
| return &result, fmt.Errorf("resty DelJob: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| var response models.NotebookResult | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func DelNotebook2(jobID string) (*models.NotebookDelResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.NotebookDelResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) | |||
| if err != nil { | |||
| return &result, fmt.Errorf("resty DelJob: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| var response models.NotebookResult | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func DelJob(jobID string) (*models.NotebookDelResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| @@ -397,7 +175,7 @@ sendjob: | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) | |||
| Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) | |||
| if err != nil { | |||
| return &result, fmt.Errorf("resty DelJob: %v", err) | |||
| @@ -424,45 +202,6 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.NotebookGetJobTokenResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token") | |||
| if err != nil { | |||
| return &result, fmt.Errorf("resty GetJobToken: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| var response models.NotebookResult | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| @@ -519,61 +258,6 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.CreateTrainJobResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetBody(createJobVersionParams). | |||
| SetResult(&result). | |||
| Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty create train-job version: %s", err) | |||
| } | |||
| req, _ := json.Marshal(createJobVersionParams) | |||
| log.Info("%s", req) | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'." | |||
| DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'." | |||
| if temp.ErrorMsg == BootFileErrorMsg { | |||
| log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("启动文件错误!") | |||
| } | |||
| if temp.ErrorMsg == DataSetErrorMsg { | |||
| log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("数据集错误!") | |||
| } | |||
| return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| @@ -616,145 +300,6 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.CreateTrainJobConfigResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetBody(req). | |||
| SetResult(&result). | |||
| Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| //temp, _ := json.Marshal(req) | |||
| //log.Info("%s", temp) | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetConfigListResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetQueryParams(map[string]string{ | |||
| "per_page": strconv.Itoa(perPage), | |||
| "page": strconv.Itoa(page), | |||
| "sortBy": sortBy, | |||
| "order": order, | |||
| "search_content": searchContent, | |||
| "config_type": configType, | |||
| }). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetConfigList: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func GetParaConfig(configName, configType string) (models.GetConfigResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetConfigResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetQueryParams(map[string]string{ | |||
| "config_type": configType, | |||
| }). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName) | |||
| if err != nil { | |||
| return result, fmt.Errorf("resty GetParaConfig: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return result, nil | |||
| } | |||
| func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| @@ -1062,51 +607,3 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.CreateNotebookResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetBody(createJobParams). | |||
| SetResult(&result). | |||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty create notebook2: %s", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| var response models.NotebookResult | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| if response.ErrorCode == errorCodeExceedLimit { | |||
| response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" | |||
| } | |||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| @@ -3,9 +3,11 @@ package repo | |||
| import ( | |||
| "code.gitea.io/gitea/modules/auth" | |||
| "code.gitea.io/gitea/modules/git" | |||
| "code.gitea.io/gitea/modules/grampus" | |||
| "code.gitea.io/gitea/modules/modelarts" | |||
| "code.gitea.io/gitea/modules/util" | |||
| "encoding/json" | |||
| "errors" | |||
| "io/ioutil" | |||
| "net/http" | |||
| "os" | |||
| @@ -149,7 +151,7 @@ func grampusGpuNewDataPrepare(ctx *context.Context) error { | |||
| } | |||
| func GrampusTrainJobNPUNew(ctx *context.Context) { | |||
| err := trainJobNpuNewDataPrepare(ctx) | |||
| err := grampusTrainJobNpuNewDataPrepare(ctx) | |||
| if err != nil { | |||
| ctx.ServerError("get new train-job info failed", err) | |||
| return | |||
| @@ -157,7 +159,7 @@ func GrampusTrainJobNPUNew(ctx *context.Context) { | |||
| ctx.HTML(200, tplGrampusTrainJobNPUNew) | |||
| } | |||
| func trainJobNpuNewDataPrepare(ctx *context.Context) error { | |||
| func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error { | |||
| ctx.Data["PageIsCloudBrain"] = true | |||
| t := time.Now() | |||
| @@ -215,110 +217,122 @@ func trainJobNpuNewDataPrepare(ctx *context.Context) error { | |||
| return nil | |||
| } | |||
| func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { | |||
| func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error { | |||
| if !strings.HasSuffix(form.BootFile, ".py") { | |||
| log.Error("the boot file(%s) must be a python file", form.BootFile) | |||
| return errors.New("启动文件必须是python文件") | |||
| } | |||
| if form.BranchName == "" { | |||
| log.Error("the branch must not be null!", form.BranchName) | |||
| return errors.New("代码分支不能为空!") | |||
| } | |||
| return nil | |||
| } | |||
| func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { | |||
| VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) | |||
| displayJobName := form.DisplayJobName | |||
| jobName := util.ConvertDisplayJobNameToJobName(displayJobName) | |||
| //todo:del | |||
| jobName = displayJobName | |||
| uuid := form.Attachment | |||
| description := form.Description | |||
| workServerNumber := form.WorkServerNumber | |||
| engineID := form.EngineID | |||
| bootFile := form.BootFile | |||
| flavorCode := form.Flavor | |||
| params := form.Params | |||
| poolID := form.PoolID | |||
| isSaveParam := form.IsSaveParam | |||
| repo := ctx.Repo.Repository | |||
| codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | |||
| codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | |||
| outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/" | |||
| logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/" | |||
| dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" | |||
| branch_name := form.BranchName | |||
| branchName := form.BranchName | |||
| isLatestVersion := modelarts.IsLatestVersion | |||
| FlavorName := form.FlavorName | |||
| VersionCount := modelarts.VersionCount | |||
| EngineName := form.EngineName | |||
| count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) | |||
| log.Info(jobName) | |||
| count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) | |||
| log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } else { | |||
| if count >= 1 { | |||
| log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| } | |||
| if err := paramCheckCreateTrainJob(form); err != nil { | |||
| if err := grampusParamCheckCreateTrainJob(form); err != nil { | |||
| log.Error("paramCheckCreateTrainJob failed:(%v)", err) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| //Determine whether the task name of the task in the project is duplicated | |||
| //check whether the task name in the project is duplicated | |||
| tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) | |||
| if err == nil { | |||
| if len(tasks) != 0 { | |||
| log.Error("the job name did already exist", ctx.Data["MsgID"]) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| } else { | |||
| if !models.IsErrJobNotExist(err) { | |||
| log.Error("system error, %v", err, ctx.Data["MsgID"]) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| } | |||
| //todo: del the codeLocalPath | |||
| //prepare code and out path | |||
| _, err = ioutil.ReadDir(codeLocalPath) | |||
| if err == nil { | |||
| os.RemoveAll(codeLocalPath) | |||
| } | |||
| gitRepo, _ := git.OpenRepository(repo.RepoPath()) | |||
| commitID, _ := gitRepo.GetBranchCommitID(branch_name) | |||
| commitID, _ := gitRepo.GetBranchCommitID(branchName) | |||
| if err := downloadCode(repo, codeLocalPath, branch_name); err != nil { | |||
| if err := downloadCode(repo, codeLocalPath, branchName); err != nil { | |||
| log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| //todo: upload code (send to file_server todo this work?) | |||
| if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { | |||
| log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { | |||
| log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("Failed to obsMkdir_log", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| // parentDir := VersionOutputPath + "/" | |||
| if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | |||
| // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { | |||
| log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| //prepare command | |||
| //todo: download code, download dataset, unzip dataset, exec code, upload model | |||
| var parameters models.Parameters | |||
| param := make([]models.Parameter, 0) | |||
| existDeviceTarget := false | |||
| @@ -326,8 +340,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra | |||
| err := json.Unmarshal([]byte(params), ¶meters) | |||
| if err != nil { | |||
| log.Error("Failed to Unmarshal params: %s (%v)", params, err) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("运行参数错误", tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| @@ -350,67 +364,32 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra | |||
| }) | |||
| } | |||
| //save param config | |||
| if isSaveParam == "on" { | |||
| saveparams := append(param, models.Parameter{ | |||
| Label: modelarts.TrainUrl, | |||
| Value: outputObsPath, | |||
| }, models.Parameter{ | |||
| Label: modelarts.DataUrl, | |||
| Value: dataPath, | |||
| }) | |||
| if form.ParameterTemplateName == "" { | |||
| log.Error("ParameterTemplateName is empty") | |||
| trainJobNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) | |||
| return | |||
| } | |||
| _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ | |||
| ConfigName: form.ParameterTemplateName, | |||
| Description: form.PrameterDescription, | |||
| DataUrl: dataPath, | |||
| AppUrl: codeObsPath, | |||
| BootFileUrl: codeObsPath + bootFile, | |||
| TrainUrl: outputObsPath, | |||
| Flavor: models.Flavor{ | |||
| Code: flavorCode, | |||
| }, | |||
| WorkServerNum: workServerNumber, | |||
| EngineID: int64(engineID), | |||
| LogUrl: logObsPath, | |||
| PoolID: poolID, | |||
| Parameter: saveparams, | |||
| }) | |||
| if err != nil { | |||
| log.Error("Failed to CreateTrainJobConfig: %v", err) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) | |||
| return | |||
| } | |||
| } | |||
| req := &modelarts.GenerateTrainJobReq{ | |||
| JobName: jobName, | |||
| DisplayJobName: displayJobName, | |||
| DataUrl: dataPath, | |||
| Description: description, | |||
| CodeObsPath: codeObsPath, | |||
| BootFileUrl: codeObsPath + bootFile, | |||
| BootFile: bootFile, | |||
| TrainUrl: outputObsPath, | |||
| FlavorCode: flavorCode, | |||
| WorkServerNumber: workServerNumber, | |||
| EngineID: int64(engineID), | |||
| LogUrl: logObsPath, | |||
| PoolID: poolID, | |||
| Uuid: uuid, | |||
| Parameters: param, | |||
| CommitID: commitID, | |||
| IsLatestVersion: isLatestVersion, | |||
| BranchName: branch_name, | |||
| Params: form.Params, | |||
| req := &grampus.GenerateTrainJobReq{ | |||
| JobName: jobName, | |||
| DisplayJobName: displayJobName, | |||
| ComputeResource: models.NPUResource, | |||
| Command: "echo \"test\"", | |||
| ResourceSpecId: "modelarts.kat1.xlarge", | |||
| ImageUrl: "", | |||
| ImageId: "tensorflow_1.15-cann_5.0.3-py_3.7-euler_2.8.3-aarch64", | |||
| DataUrl: dataPath, | |||
| Description: description, | |||
| CodeObsPath: codeObsPath, | |||
| BootFileUrl: codeObsPath + bootFile, | |||
| BootFile: bootFile, | |||
| //TrainUrl: outputObsPath, | |||
| //FlavorCode: flavorCode, | |||
| WorkServerNumber: 1, | |||
| //EngineID: int64(engineID), | |||
| //LogUrl: logObsPath, | |||
| //PoolID: poolID, | |||
| Uuid: uuid, | |||
| //Parameters: param, | |||
| CommitID: commitID, | |||
| IsLatestVersion: isLatestVersion, | |||
| BranchName: branchName, | |||
| //Params: form.Params, | |||
| FlavorName: FlavorName, | |||
| EngineName: EngineName, | |||
| VersionCount: VersionCount, | |||
| @@ -424,11 +403,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra | |||
| return | |||
| } | |||
| err = modelarts.GenerateTrainJob(ctx, req) | |||
| err = grampus.GenerateTrainJob(ctx, req) | |||
| if err != nil { | |||
| log.Error("GenerateTrainJob failed:%v", err.Error()) | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | |||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||
| ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) | |||
| return | |||
| } | |||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") | |||
| @@ -559,24 +559,11 @@ func TrainJobIndex(ctx *context.Context) { | |||
| } | |||
| listType := ctx.Query("listType") | |||
| if len(listType) == 0 { | |||
| listType = models.AllResource | |||
| } | |||
| ctx.Data["ListType"] = listType | |||
| typeCloudBrain := models.TypeCloudBrainAll | |||
| if listType == models.GPUResource { | |||
| typeCloudBrain = models.TypeCloudBrainOne | |||
| } else if listType == models.NPUResource { | |||
| typeCloudBrain = models.TypeCloudBrainTwo | |||
| } else if listType == models.AllResource { | |||
| typeCloudBrain = models.TypeCloudBrainAll | |||
| if listType == models.AllResource { | |||
| listType = "" | |||
| } | |||
| //else { | |||
| // log.Error("listType(%s) error", listType) | |||
| // ctx.ServerError("listType error", errors.New("listType error")) | |||
| // return | |||
| //} | |||
| var jobTypes []string | |||
| jobTypes = append(jobTypes, string(models.JobTypeTrain)) | |||
| @@ -586,10 +573,10 @@ func TrainJobIndex(ctx *context.Context) { | |||
| PageSize: setting.UI.IssuePagingNum, | |||
| }, | |||
| RepoID: repo.ID, | |||
| Type: typeCloudBrain, | |||
| JobTypeNot: false, | |||
| JobTypes: jobTypes, | |||
| IsLatestVersion: modelarts.IsLatestVersion, | |||
| ComputeResource: listType, | |||
| }) | |||
| if err != nil { | |||
| ctx.ServerError("Cloudbrain", err) | |||
| @@ -1103,7 +1103,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) | |||
| }) | |||
| m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew) | |||
| //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate) | |||
| m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusTrainJobForm{}), repo.GrampusTrainJobNpuCreate) | |||
| }) | |||
| }) | |||
| }, context.RepoRef()) | |||