| @@ -24,6 +24,7 @@ type ModelArtsJobStatus string | |||||
| // Cloudbrain platform type identifiers. The first three values are assigned | |||||
| // sequentially by iota (0, 1, 2); TypeCloudBrainAll is an explicit -1. | |||||
| // NOTE(review): -1 presumably acts as a "match every type" filter value - confirm against callers. | |||||
| const ( | const ( | ||||
| TypeCloudBrainOne int = iota | TypeCloudBrainOne int = iota | ||||
| TypeCloudBrainTwo | TypeCloudBrainTwo | ||||
| TypeCloudBrainGrampus | |||||
| TypeCloudBrainAll = -1 | TypeCloudBrainAll = -1 | ||||
| ) | ) | ||||
| @@ -98,6 +99,14 @@ const ( | |||||
| ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败 | ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败 | ||||
| DURATION_STR_ZERO = "00:00:00" | DURATION_STR_ZERO = "00:00:00" | ||||
| // Grampus job status strings | |||||
| GrampusStatusPending = "pending" | |||||
| GrampusStatusRunning = "running" | |||||
| GrampusStatusFailed = "failed" | |||||
| GrampusStatusSucceeded = "succeeded" | |||||
| GrampusStatusStopped = "stopped" | |||||
| GrampusStatusUnknown = "unknown" | |||||
| ) | ) | ||||
| type Cloudbrain struct { | type Cloudbrain struct { | ||||
| @@ -328,6 +337,7 @@ type CloudbrainsOptions struct { | |||||
| JobTypeNot bool | JobTypeNot bool | ||||
| NeedRepoInfo bool | NeedRepoInfo bool | ||||
| RepoIDList []int64 | RepoIDList []int64 | ||||
| ComputeResource string | |||||
| } | } | ||||
| type TaskPod struct { | type TaskPod struct { | ||||
| @@ -1150,6 +1160,44 @@ type LogFile struct { | |||||
| Name string | Name string | ||||
| } | } | ||||
| // Grampus request and response types | |||||
| // GrampusResult holds the error code and error message decoded from the | |||||
| // "errorCode" and "errorMsg" JSON fields of a Grampus response. It is | |||||
| // embedded in CreateGrampusJobResponse. | |||||
| type GrampusResult struct { | |||||
| ErrorCode int `json:"errorCode"` | |||||
| ErrorMsg string `json:"errorMsg"` | |||||
| } | |||||
| // GrampusJobInfo describes a single Grampus job as decoded from JSON. | |||||
| // NOTE(review): the int64 time fields are presumably Unix timestamps and | |||||
| // RunSec a duration in seconds - confirm against the Grampus API. | |||||
| type GrampusJobInfo struct { | |||||
| StartedAt int64 `json:"startedAt"` | |||||
| RunSec int64 `json:"runSec"` | |||||
| CompletedAt int64 `json:"completedAt"` | |||||
| CreatedAt int64 `json:"createdAt"` | |||||
| UpdatedAt int64 `json:"updatedAt"` | |||||
| Desc string `json:"desc"` | |||||
| // JobID is read from the JSON key "id". | |||||
| JobID string `json:"id"` | |||||
| Name string `json:"name"` | |||||
| Status string `json:"status"` | |||||
| UserID string `json:"userId"` | |||||
| Tasks []GrampusTasks `json:"tasks"` | |||||
| } | |||||
| // CreateGrampusJobResponse is the decoded reply to a create-job request: | |||||
| // the embedded GrampusResult error fields plus the job description found | |||||
| // under the "otJob" JSON key. | |||||
| type CreateGrampusJobResponse struct { | |||||
| GrampusResult | |||||
| JobInfo GrampusJobInfo `json:"otJob"` | |||||
| } | |||||
| // GrampusTasks describes one task of a Grampus job: the command to run, | |||||
| // the task name, the image (by id and by URL) and the resource spec id. | |||||
| // It is used both in create-job requests and in decoded job info. | |||||
| type GrampusTasks struct { | |||||
| Command string `json:"command"` | |||||
| Name string `json:"name"` | |||||
| ImageId string `json:"imageId"` | |||||
| ResourceSpecId string `json:"resourceSpecId"` | |||||
| ImageUrl string `json:"imageUrl"` | |||||
| } | |||||
| // CreateGrampusJobRequest is the JSON body of a create-job request: a job | |||||
| // name and the list of tasks that make up the job. | |||||
| type CreateGrampusJobRequest struct { | |||||
| Name string `json:"name"` | |||||
| Tasks []GrampusTasks `json:"tasks"` | |||||
| } | |||||
| func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | ||||
| sess := x.NewSession() | sess := x.NewSession() | ||||
| defer sess.Close() | defer sess.Close() | ||||
| @@ -1179,6 +1227,12 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | |||||
| ) | ) | ||||
| } | } | ||||
| // Filter by compute resource only when the caller supplied one. The | |||||
| // previous check (len >= 0) was always true, so an unset ComputeResource | |||||
| // added a compute_resource = '' condition to every query. | |||||
| if opts.ComputeResource != "" { | |||||
| cond = cond.And( | |||||
| builder.Eq{"cloudbrain.compute_resource": opts.ComputeResource}, | |||||
| ) | |||||
| } | |||||
| if len(opts.JobTypes) > 0 { | if len(opts.JobTypes) > 0 { | ||||
| if opts.JobTypeNot { | if opts.JobTypeNot { | ||||
| cond = cond.And( | cond = cond.And( | ||||
| @@ -1589,6 +1643,11 @@ func GetCloudbrainInferenceJobCountByUserID(userID int64) (int, error) { | |||||
| return int(count), err | return int(count), err | ||||
| } | } | ||||
| // GetGrampusCountByUserID counts the Cloudbrain rows of type | |||||
| // TypeCloudBrainGrampus that belong to userID, have the given job type and | |||||
| // compute resource, and whose status is pending or running. | |||||
| func GetGrampusCountByUserID(userID int64, jobType, computeResource string) (int, error) { | |||||
| sess := x.In("status", GrampusStatusPending, GrampusStatusRunning) | |||||
| sess = sess.And("job_type = ? and user_id = ? and type = ?", jobType, userID, TypeCloudBrainGrampus) | |||||
| sess = sess.And("compute_resource = ?", computeResource) | |||||
| total, err := sess.Count(new(Cloudbrain)) | |||||
| return int(total), err | |||||
| } | |||||
| func UpdateInferenceJob(job *Cloudbrain) error { | func UpdateInferenceJob(job *Cloudbrain) error { | ||||
| return updateInferenceJob(x, job) | return updateInferenceJob(x, job) | ||||
| } | } | ||||
| @@ -0,0 +1,23 @@ | |||||
| package auth | |||||
| import ( | |||||
| "gitea.com/macaron/binding" | |||||
| "gitea.com/macaron/macaron" | |||||
| ) | |||||
| // CreateGrampusTrainJobForm is the form bound when creating a Grampus | |||||
| // train job. Every field except Description is marked Required. | |||||
| // NOTE(review): the FlavorName form key is spelled "flaver_names"; it | |||||
| // presumably matches the submitting template - confirm before renaming. | |||||
| type CreateGrampusTrainJobForm struct { | |||||
| DisplayJobName string `form:"display_job_name" binding:"Required"` | |||||
| JobName string `form:"job_name" binding:"Required"` | |||||
| Attachment string `form:"attachment" binding:"Required"` | |||||
| BootFile string `form:"boot_file" binding:"Required"` | |||||
| Flavor string `form:"flavor" binding:"Required"` | |||||
| Params string `form:"run_para_list" binding:"Required"` | |||||
| Description string `form:"description"` | |||||
| BranchName string `form:"branch_name" binding:"Required"` | |||||
| FlavorName string `form:"flaver_names" binding:"Required"` | |||||
| EngineName string `form:"engine_names" binding:"Required"` | |||||
| } | |||||
| // Validate passes the binding errors, the context data, the form itself and | |||||
| // the context locale to the package-level validate helper and returns its result. | |||||
| func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { | |||||
| checked := validate(errs, ctx.Data, f, ctx.Locale) | |||||
| return checked | |||||
| } | |||||
| @@ -48,8 +48,6 @@ func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, er | |||||
| if !ctx.IsSigned { | if !ctx.IsSigned { | ||||
| return false | return false | ||||
| } | } | ||||
| log.Info("is repo owner:" + strconv.FormatBool(ctx.IsUserRepoOwner())) | |||||
| log.Info("is user admin:" + strconv.FormatBool(ctx.IsUserSiteAdmin())) | |||||
| if err != nil { | if err != nil { | ||||
| return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() | return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() | ||||
| @@ -1,13 +1,11 @@ | |||||
| package grampus | package grampus | ||||
| import ( | import ( | ||||
| "code.gitea.io/gitea/modules/timeutil" | |||||
| "strconv" | |||||
| "code.gitea.io/gitea/models" | "code.gitea.io/gitea/models" | ||||
| "code.gitea.io/gitea/modules/context" | "code.gitea.io/gitea/modules/context" | ||||
| "code.gitea.io/gitea/modules/log" | "code.gitea.io/gitea/modules/log" | ||||
| "code.gitea.io/gitea/modules/notification" | "code.gitea.io/gitea/modules/notification" | ||||
| "code.gitea.io/gitea/modules/timeutil" | |||||
| ) | ) | ||||
| const ( | const ( | ||||
| @@ -21,19 +19,6 @@ const ( | |||||
| NotebookType = "Ascend" | NotebookType = "Ascend" | ||||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | ||||
| //train-job | |||||
| // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" | |||||
| // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" | |||||
| // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + | |||||
| // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + | |||||
| // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + | |||||
| // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + | |||||
| // "]}" | |||||
| // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + | |||||
| // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + | |||||
| // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | |||||
| // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | |||||
| // "]}" | |||||
| CodePath = "/code/" | CodePath = "/code/" | ||||
| OutputPath = "/output/" | OutputPath = "/output/" | ||||
| ResultPath = "/result/" | ResultPath = "/result/" | ||||
| @@ -65,7 +50,12 @@ var ( | |||||
| ) | ) | ||||
| type GenerateTrainJobReq struct { | type GenerateTrainJobReq struct { | ||||
| JobName string | |||||
| JobName string | |||||
| Command string | |||||
| ResourceSpecId string | |||||
| ImageUrl string | |||||
| ImageId string | |||||
| DisplayJobName string | DisplayJobName string | ||||
| Uuid string | Uuid string | ||||
| Description string | Description string | ||||
| @@ -74,15 +64,10 @@ type GenerateTrainJobReq struct { | |||||
| BootFileUrl string | BootFileUrl string | ||||
| DataUrl string | DataUrl string | ||||
| TrainUrl string | TrainUrl string | ||||
| FlavorCode string | |||||
| LogUrl string | |||||
| PoolID string | |||||
| WorkServerNumber int | WorkServerNumber int | ||||
| EngineID int64 | EngineID int64 | ||||
| Parameters []models.Parameter | |||||
| CommitID string | CommitID string | ||||
| IsLatestVersion string | IsLatestVersion string | ||||
| Params string | |||||
| BranchName string | BranchName string | ||||
| PreVersionId int64 | PreVersionId int64 | ||||
| PreVersionName string | PreVersionName string | ||||
| @@ -90,139 +75,54 @@ type GenerateTrainJobReq struct { | |||||
| VersionCount int | VersionCount int | ||||
| EngineName string | EngineName string | ||||
| TotalVersionCount int | TotalVersionCount int | ||||
| } | |||||
| type GenerateInferenceJobReq struct { | |||||
| JobName string | |||||
| DisplayJobName string | |||||
| Uuid string | |||||
| Description string | |||||
| CodeObsPath string | |||||
| BootFile string | |||||
| BootFileUrl string | |||||
| DataUrl string | |||||
| TrainUrl string | |||||
| FlavorCode string | |||||
| LogUrl string | |||||
| PoolID string | |||||
| WorkServerNumber int | |||||
| EngineID int64 | |||||
| Parameters []models.Parameter | |||||
| CommitID string | |||||
| Params string | |||||
| BranchName string | |||||
| FlavorName string | |||||
| EngineName string | |||||
| LabelName string | |||||
| IsLatestVersion string | |||||
| VersionCount int | |||||
| TotalVersionCount int | |||||
| ModelName string | |||||
| ModelVersion string | |||||
| CkptName string | |||||
| ResultUrl string | |||||
| } | |||||
| type VersionInfo struct { | |||||
| Version []struct { | |||||
| ID int `json:"id"` | |||||
| Value string `json:"value"` | |||||
| } `json:"version"` | |||||
| } | |||||
| type Flavor struct { | |||||
| Info []struct { | |||||
| Code string `json:"code"` | |||||
| Value string `json:"value"` | |||||
| } `json:"flavor"` | |||||
| } | |||||
| type Engine struct { | |||||
| Info []struct { | |||||
| ID int `json:"id"` | |||||
| Value string `json:"value"` | |||||
| } `json:"engine"` | |||||
| } | |||||
| type ResourcePool struct { | |||||
| Info []struct { | |||||
| ID string `json:"id"` | |||||
| Value string `json:"value"` | |||||
| } `json:"resource_pool"` | |||||
| } | |||||
| // type Parameter struct { | |||||
| // Label string `json:"label"` | |||||
| // Value string `json:"value"` | |||||
| // } | |||||
| // type Parameters struct { | |||||
| // Parameter []Parameter `json:"parameter"` | |||||
| // } | |||||
| type Parameters struct { | |||||
| Parameter []struct { | |||||
| Label string `json:"label"` | |||||
| Value string `json:"value"` | |||||
| } `json:"parameter"` | |||||
| ComputeResource string | |||||
| DatasetName string | |||||
| } | } | ||||
| func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { | func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { | ||||
| createTime := timeutil.TimeStampNow() | createTime := timeutil.TimeStampNow() | ||||
| jobResult, err := createTrainJob(models.CreateTrainJobParams{ | |||||
| JobName: req.JobName, | |||||
| Description: req.Description, | |||||
| Config: models.Config{ | |||||
| WorkServerNum: req.WorkServerNumber, | |||||
| AppUrl: req.CodeObsPath, | |||||
| BootFileUrl: req.BootFileUrl, | |||||
| DataUrl: req.DataUrl, | |||||
| EngineID: req.EngineID, | |||||
| TrainUrl: req.TrainUrl, | |||||
| LogUrl: req.LogUrl, | |||||
| PoolID: req.PoolID, | |||||
| CreateVersion: true, | |||||
| Flavor: models.Flavor{ | |||||
| Code: req.FlavorCode, | |||||
| jobResult, err := createJob(models.CreateGrampusJobRequest{ | |||||
| Name: req.JobName, | |||||
| Tasks: []models.GrampusTasks{ | |||||
| { | |||||
| Name: req.JobName, | |||||
| Command: req.Command, | |||||
| ResourceSpecId: req.ResourceSpecId, | |||||
| ImageId: req.ImageId, | |||||
| ImageUrl: req.ImageUrl, | |||||
| }, | }, | ||||
| Parameter: req.Parameters, | |||||
| }, | }, | ||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("CreateJob failed: %v", err.Error()) | |||||
| log.Error("createJob failed: %v", err.Error()) | |||||
| return err | return err | ||||
| } | } | ||||
| attach, err := models.GetAttachmentByUUID(req.Uuid) | |||||
| if err != nil { | |||||
| log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) | |||||
| return err | |||||
| } | |||||
| jobId := strconv.FormatInt(jobResult.JobID, 10) | |||||
| jobID := jobResult.JobInfo.JobID | |||||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | err = models.CreateCloudbrain(&models.Cloudbrain{ | ||||
| Status: TransTrainJobStatus(jobResult.Status), | |||||
| UserID: ctx.User.ID, | |||||
| RepoID: ctx.Repo.Repository.ID, | |||||
| JobID: jobId, | |||||
| JobName: req.JobName, | |||||
| DisplayJobName: req.DisplayJobName, | |||||
| JobType: string(models.JobTypeTrain), | |||||
| Type: models.TypeCloudBrainTwo, | |||||
| VersionID: jobResult.VersionID, | |||||
| VersionName: jobResult.VersionName, | |||||
| Uuid: req.Uuid, | |||||
| DatasetName: attach.Name, | |||||
| CommitID: req.CommitID, | |||||
| IsLatestVersion: req.IsLatestVersion, | |||||
| ComputeResource: models.NPUResource, | |||||
| EngineID: req.EngineID, | |||||
| TrainUrl: req.TrainUrl, | |||||
| BranchName: req.BranchName, | |||||
| Parameters: req.Params, | |||||
| BootFile: req.BootFile, | |||||
| DataUrl: req.DataUrl, | |||||
| LogUrl: req.LogUrl, | |||||
| FlavorCode: req.FlavorCode, | |||||
| Status: string(models.GrampusStatusPending), | |||||
| UserID: ctx.User.ID, | |||||
| RepoID: ctx.Repo.Repository.ID, | |||||
| JobID: jobID, | |||||
| JobName: req.JobName, | |||||
| DisplayJobName: req.DisplayJobName, | |||||
| JobType: string(models.JobTypeTrain), | |||||
| Type: models.TypeCloudBrainGrampus, | |||||
| //VersionID: jobResult.VersionID, | |||||
| //VersionName: jobResult.VersionName, | |||||
| Uuid: req.Uuid, | |||||
| DatasetName: req.DatasetName, | |||||
| CommitID: req.CommitID, | |||||
| //IsLatestVersion: req.IsLatestVersion, | |||||
| ComputeResource: req.ComputeResource, | |||||
| //EngineID: req.EngineID, | |||||
| TrainUrl: req.TrainUrl, | |||||
| BranchName: req.BranchName, | |||||
| //Parameters: req.Params, | |||||
| BootFile: req.BootFile, | |||||
| DataUrl: req.DataUrl, | |||||
| //LogUrl: req.LogUrl, | |||||
| //FlavorCode: req.FlavorCode, | |||||
| Description: req.Description, | Description: req.Description, | ||||
| WorkServerNumber: req.WorkServerNumber, | WorkServerNumber: req.WorkServerNumber, | ||||
| FlavorName: req.FlavorName, | FlavorName: req.FlavorName, | ||||
| @@ -237,58 +137,14 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error | |||||
| log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) | log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) | ||||
| return err | return err | ||||
| } | } | ||||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask) | |||||
| return nil | |||||
| } | |||||
| func TransTrainJobStatus(status int) string { | |||||
| switch status { | |||||
| case 0: | |||||
| return "UNKNOWN" | |||||
| case 1: | |||||
| return "INIT" | |||||
| case 2: | |||||
| return "IMAGE_CREATING" | |||||
| case 3: | |||||
| return "IMAGE_FAILED" | |||||
| case 4: | |||||
| return "SUBMIT_TRYING" | |||||
| case 5: | |||||
| return "SUBMIT_FAILED" | |||||
| case 6: | |||||
| return "DELETE_FAILED" | |||||
| case 7: | |||||
| return "WAITING" | |||||
| case 8: | |||||
| return "RUNNING" | |||||
| case 9: | |||||
| return "KILLING" | |||||
| case 10: | |||||
| return "COMPLETED" | |||||
| case 11: | |||||
| return "FAILED" | |||||
| case 12: | |||||
| return "KILLED" | |||||
| case 13: | |||||
| return "CANCELED" | |||||
| case 14: | |||||
| return "LOST" | |||||
| case 15: | |||||
| return "SCALING" | |||||
| case 16: | |||||
| return "SUBMIT_MODEL_FAILED" | |||||
| case 17: | |||||
| return "DEPLOY_SERVICE_FAILED" | |||||
| case 18: | |||||
| return "CHECK_INIT" | |||||
| case 19: | |||||
| return "CHECK_RUNNING" | |||||
| case 20: | |||||
| return "CHECK_RUNNING_COMPLETED" | |||||
| case 21: | |||||
| return "CHECK_FAILED" | |||||
| default: | |||||
| return strconv.Itoa(status) | |||||
| var actionType models.ActionType | |||||
| if req.ComputeResource == models.NPUResource { | |||||
| actionType = models.ActionCreateTrainTask | |||||
| } else if req.ComputeResource == models.GPUResource { | |||||
| actionType = models.ActionCreateGPUTrainTask | |||||
| } | } | ||||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType) | |||||
| return nil | |||||
| } | } | ||||
| @@ -23,19 +23,18 @@ const ( | |||||
| urlOpenApiV1 = "/openapi/v1/" | urlOpenApiV1 = "/openapi/v1/" | ||||
| urlGetToken = urlOpenApiV1 + "token" | urlGetToken = urlOpenApiV1 + "token" | ||||
| urlNotebook = "/demanager/instances" | |||||
| urlTrainJob = "/training-jobs" | |||||
| urlTrainJob = urlOpenApiV1 + "trainjob" | |||||
| urlResourceSpecs = "/job/resource-specs" | urlResourceSpecs = "/job/resource-specs" | ||||
| urlTrainJobConfig = "/training-job-configs" | urlTrainJobConfig = "/training-job-configs" | ||||
| errorCodeExceedLimit = "ModelArts.0118" | errorCodeExceedLimit = "ModelArts.0118" | ||||
| urlNotebook2 = "" | urlNotebook2 = "" | ||||
| modelartsIllegalToken = "" | |||||
| errorIllegalToken = 1005 | |||||
| ) | ) | ||||
| type GetTokenParams struct { | type GetTokenParams struct { | ||||
| UserName string `json:"user_name"` | |||||
| UserName string `json:"username"` | |||||
| Password string `json:"password"` | Password string `json:"password"` | ||||
| } | } | ||||
| @@ -92,44 +91,34 @@ func getToken() error { | |||||
| return nil | return nil | ||||
| } | } | ||||
| func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { | |||||
| func createJob(req models.CreateGrampusJobRequest) (*models.CreateGrampusJobResponse, error) { | |||||
| checkSetting() | checkSetting() | ||||
| client := getRestyClient() | client := getRestyClient() | ||||
| var result models.CreateNotebookResult | |||||
| var result models.CreateGrampusJobResponse | |||||
| retry := 0 | retry := 0 | ||||
| sendjob: | sendjob: | ||||
| res, err := client.R(). | |||||
| _, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | SetHeader("Content-Type", "application/json"). | ||||
| SetAuthToken(TOKEN). | SetAuthToken(TOKEN). | ||||
| SetBody(createJobParams). | |||||
| SetBody(req). | |||||
| SetResult(&result). | SetResult(&result). | ||||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) | |||||
| Post(HOST + urlTrainJob) | |||||
| if err != nil { | if err != nil { | ||||
| return nil, fmt.Errorf("resty create notebook: %s", err) | |||||
| return nil, fmt.Errorf("resty CreateJob: %s", err) | |||||
| } | } | ||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| if result.ErrorCode == errorIllegalToken && retry < 1 { | |||||
| retry++ | retry++ | ||||
| _ = getToken() | _ = getToken() | ||||
| goto sendjob | goto sendjob | ||||
| } | } | ||||
| var response models.NotebookResult | |||||
| err = json.Unmarshal(res.Body(), &response) | |||||
| if err != nil { | |||||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||||
| } | |||||
| if len(response.ErrorCode) != 0 { | |||||
| log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| if response.ErrorCode == errorCodeExceedLimit { | |||||
| response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" | |||||
| } | |||||
| return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| if result.ErrorCode != 0 { | |||||
| log.Error("CreateJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) | |||||
| return &result, fmt.Errorf("CreateJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) | |||||
| } | } | ||||
| return &result, nil | return &result, nil | ||||
| @@ -147,7 +136,7 @@ sendjob: | |||||
| SetHeader("Content-Type", "application/json"). | SetHeader("Content-Type", "application/json"). | ||||
| SetAuthToken(TOKEN). | SetAuthToken(TOKEN). | ||||
| SetResult(&result). | SetResult(&result). | ||||
| Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) | |||||
| if err != nil { | if err != nil { | ||||
| return nil, fmt.Errorf("resty GetJob: %v", err) | return nil, fmt.Errorf("resty GetJob: %v", err) | ||||
| @@ -174,217 +163,6 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.GetNotebook2Result | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty GetJob: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| var response models.NotebookResult | |||||
| err = json.Unmarshal(res.Body(), &response) | |||||
| if err != nil { | |||||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||||
| } | |||||
| if len(response.ErrorCode) != 0 { | |||||
| log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.NotebookActionResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetBody(param). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action") | |||||
| if err != nil { | |||||
| return &result, fmt.Errorf("resty StopJob: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| var response models.NotebookResult | |||||
| err = json.Unmarshal(res.Body(), &response) | |||||
| if err != nil { | |||||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||||
| } | |||||
| if len(response.ErrorCode) != 0 { | |||||
| log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.NotebookActionResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs)) | |||||
| if err != nil { | |||||
| return &result, fmt.Errorf("resty ManageNotebook2: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| var response models.NotebookResult | |||||
| err = json.Unmarshal(res.Body(), &response) | |||||
| if err != nil { | |||||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||||
| } | |||||
| if len(response.ErrorCode) != 0 { | |||||
| log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func DelNotebook(jobID string) (*models.NotebookDelResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.NotebookDelResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) | |||||
| if err != nil { | |||||
| return &result, fmt.Errorf("resty DelJob: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| var response models.NotebookResult | |||||
| err = json.Unmarshal(res.Body(), &response) | |||||
| if err != nil { | |||||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||||
| } | |||||
| if len(response.ErrorCode) != 0 { | |||||
| log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func DelNotebook2(jobID string) (*models.NotebookDelResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.NotebookDelResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) | |||||
| if err != nil { | |||||
| return &result, fmt.Errorf("resty DelJob: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| var response models.NotebookResult | |||||
| err = json.Unmarshal(res.Body(), &response) | |||||
| if err != nil { | |||||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||||
| } | |||||
| if len(response.ErrorCode) != 0 { | |||||
| log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func DelJob(jobID string) (*models.NotebookDelResult, error) { | func DelJob(jobID string) (*models.NotebookDelResult, error) { | ||||
| checkSetting() | checkSetting() | ||||
| client := getRestyClient() | client := getRestyClient() | ||||
| @@ -397,7 +175,7 @@ sendjob: | |||||
| SetHeader("Content-Type", "application/json"). | SetHeader("Content-Type", "application/json"). | ||||
| SetAuthToken(TOKEN). | SetAuthToken(TOKEN). | ||||
| SetResult(&result). | SetResult(&result). | ||||
| Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) | |||||
| Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) | |||||
| if err != nil { | if err != nil { | ||||
| return &result, fmt.Errorf("resty DelJob: %v", err) | return &result, fmt.Errorf("resty DelJob: %v", err) | ||||
| @@ -424,45 +202,6 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.NotebookGetJobTokenResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token") | |||||
| if err != nil { | |||||
| return &result, fmt.Errorf("resty GetJobToken: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| var response models.NotebookResult | |||||
| err = json.Unmarshal(res.Body(), &response) | |||||
| if err != nil { | |||||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||||
| } | |||||
| if len(response.ErrorCode) != 0 { | |||||
| log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { | func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { | ||||
| checkSetting() | checkSetting() | ||||
| client := getRestyClient() | client := getRestyClient() | ||||
| @@ -519,61 +258,6 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.CreateTrainJobResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetBody(createJobVersionParams). | |||||
| SetResult(&result). | |||||
| Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty create train-job version: %s", err) | |||||
| } | |||||
| req, _ := json.Marshal(createJobVersionParams) | |||||
| log.Info("%s", req) | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| if res.StatusCode() != http.StatusOK { | |||||
| var temp models.ErrorResult | |||||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| } | |||||
| BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'." | |||||
| DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'." | |||||
| if temp.ErrorMsg == BootFileErrorMsg { | |||||
| log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| return &result, fmt.Errorf("启动文件错误!") | |||||
| } | |||||
| if temp.ErrorMsg == DataSetErrorMsg { | |||||
| log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| return &result, fmt.Errorf("数据集错误!") | |||||
| } | |||||
| return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| } | |||||
| if !result.IsSuccess { | |||||
| log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { | func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { | ||||
| checkSetting() | checkSetting() | ||||
| client := getRestyClient() | client := getRestyClient() | ||||
| @@ -616,145 +300,6 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.CreateTrainJobConfigResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetBody(req). | |||||
| SetResult(&result). | |||||
| Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| //temp, _ := json.Marshal(req) | |||||
| //log.Info("%s", temp) | |||||
| if res.StatusCode() != http.StatusOK { | |||||
| var temp models.ErrorResult | |||||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| } | |||||
| log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| } | |||||
| if !result.IsSuccess { | |||||
| log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.GetConfigListResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetQueryParams(map[string]string{ | |||||
| "per_page": strconv.Itoa(perPage), | |||||
| "page": strconv.Itoa(page), | |||||
| "sortBy": sortBy, | |||||
| "order": order, | |||||
| "search_content": searchContent, | |||||
| "config_type": configType, | |||||
| }). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty GetConfigList: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| if res.StatusCode() != http.StatusOK { | |||||
| var temp models.ErrorResult | |||||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| } | |||||
| log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| } | |||||
| if !result.IsSuccess { | |||||
| log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func GetParaConfig(configName, configType string) (models.GetConfigResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.GetConfigResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetQueryParams(map[string]string{ | |||||
| "config_type": configType, | |||||
| }). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName) | |||||
| if err != nil { | |||||
| return result, fmt.Errorf("resty GetParaConfig: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| if res.StatusCode() != http.StatusOK { | |||||
| var temp models.ErrorResult | |||||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| } | |||||
| log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| } | |||||
| if !result.IsSuccess { | |||||
| log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| } | |||||
| return result, nil | |||||
| } | |||||
| func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { | func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { | ||||
| checkSetting() | checkSetting() | ||||
| client := getRestyClient() | client := getRestyClient() | ||||
| @@ -1062,51 +607,3 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.CreateNotebookResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetBody(createJobParams). | |||||
| SetResult(&result). | |||||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2) | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty create notebook2: %s", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| var response models.NotebookResult | |||||
| err = json.Unmarshal(res.Body(), &response) | |||||
| if err != nil { | |||||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||||
| } | |||||
| if len(response.ErrorCode) != 0 { | |||||
| log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| if response.ErrorCode == errorCodeExceedLimit { | |||||
| response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" | |||||
| } | |||||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| @@ -3,9 +3,11 @@ package repo | |||||
| import ( | import ( | ||||
| "code.gitea.io/gitea/modules/auth" | "code.gitea.io/gitea/modules/auth" | ||||
| "code.gitea.io/gitea/modules/git" | "code.gitea.io/gitea/modules/git" | ||||
| "code.gitea.io/gitea/modules/grampus" | |||||
| "code.gitea.io/gitea/modules/modelarts" | "code.gitea.io/gitea/modules/modelarts" | ||||
| "code.gitea.io/gitea/modules/util" | "code.gitea.io/gitea/modules/util" | ||||
| "encoding/json" | "encoding/json" | ||||
| "errors" | |||||
| "io/ioutil" | "io/ioutil" | ||||
| "net/http" | "net/http" | ||||
| "os" | "os" | ||||
| @@ -149,7 +151,7 @@ func grampusGpuNewDataPrepare(ctx *context.Context) error { | |||||
| } | } | ||||
| func GrampusTrainJobNPUNew(ctx *context.Context) { | func GrampusTrainJobNPUNew(ctx *context.Context) { | ||||
| err := trainJobNpuNewDataPrepare(ctx) | |||||
| err := grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| if err != nil { | if err != nil { | ||||
| ctx.ServerError("get new train-job info failed", err) | ctx.ServerError("get new train-job info failed", err) | ||||
| return | return | ||||
| @@ -157,7 +159,7 @@ func GrampusTrainJobNPUNew(ctx *context.Context) { | |||||
| ctx.HTML(200, tplGrampusTrainJobNPUNew) | ctx.HTML(200, tplGrampusTrainJobNPUNew) | ||||
| } | } | ||||
| func trainJobNpuNewDataPrepare(ctx *context.Context) error { | |||||
| func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error { | |||||
| ctx.Data["PageIsCloudBrain"] = true | ctx.Data["PageIsCloudBrain"] = true | ||||
| t := time.Now() | t := time.Now() | ||||
| @@ -215,110 +217,122 @@ func trainJobNpuNewDataPrepare(ctx *context.Context) error { | |||||
| return nil | return nil | ||||
| } | } | ||||
| func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { | |||||
| func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error { | |||||
| if !strings.HasSuffix(form.BootFile, ".py") { | |||||
| log.Error("the boot file(%s) must be a python file", form.BootFile) | |||||
| return errors.New("启动文件必须是python文件") | |||||
| } | |||||
| if form.BranchName == "" { | |||||
| log.Error("the branch must not be null!", form.BranchName) | |||||
| return errors.New("代码分支不能为空!") | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { | |||||
| VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) | VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) | ||||
| displayJobName := form.DisplayJobName | displayJobName := form.DisplayJobName | ||||
| jobName := util.ConvertDisplayJobNameToJobName(displayJobName) | jobName := util.ConvertDisplayJobNameToJobName(displayJobName) | ||||
| //todo:del | |||||
| jobName = displayJobName | |||||
| uuid := form.Attachment | uuid := form.Attachment | ||||
| description := form.Description | description := form.Description | ||||
| workServerNumber := form.WorkServerNumber | |||||
| engineID := form.EngineID | |||||
| bootFile := form.BootFile | bootFile := form.BootFile | ||||
| flavorCode := form.Flavor | |||||
| params := form.Params | params := form.Params | ||||
| poolID := form.PoolID | |||||
| isSaveParam := form.IsSaveParam | |||||
| repo := ctx.Repo.Repository | repo := ctx.Repo.Repository | ||||
| codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | ||||
| codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | ||||
| outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/" | |||||
| logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/" | |||||
| dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" | dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" | ||||
| branch_name := form.BranchName | |||||
| branchName := form.BranchName | |||||
| isLatestVersion := modelarts.IsLatestVersion | isLatestVersion := modelarts.IsLatestVersion | ||||
| FlavorName := form.FlavorName | FlavorName := form.FlavorName | ||||
| VersionCount := modelarts.VersionCount | VersionCount := modelarts.VersionCount | ||||
| EngineName := form.EngineName | EngineName := form.EngineName | ||||
| count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) | |||||
| log.Info(jobName) | |||||
| count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) | |||||
| log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } else { | } else { | ||||
| if count >= 1 { | if count >= 1 { | ||||
| log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) | log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| } | } | ||||
| if err := paramCheckCreateTrainJob(form); err != nil { | |||||
| if err := grampusParamCheckCreateTrainJob(form); err != nil { | |||||
| log.Error("paramCheckCreateTrainJob failed:(%v)", err) | log.Error("paramCheckCreateTrainJob failed:(%v)", err) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| //Determine whether the task name of the task in the project is duplicated | |||||
| //check whether the task name in the project is duplicated | |||||
| tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) | tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) | ||||
| if err == nil { | if err == nil { | ||||
| if len(tasks) != 0 { | if len(tasks) != 0 { | ||||
| log.Error("the job name did already exist", ctx.Data["MsgID"]) | log.Error("the job name did already exist", ctx.Data["MsgID"]) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| } else { | } else { | ||||
| if !models.IsErrJobNotExist(err) { | if !models.IsErrJobNotExist(err) { | ||||
| log.Error("system error, %v", err, ctx.Data["MsgID"]) | log.Error("system error, %v", err, ctx.Data["MsgID"]) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| } | } | ||||
| //todo: del the codeLocalPath | |||||
| //prepare code and out path | |||||
| _, err = ioutil.ReadDir(codeLocalPath) | _, err = ioutil.ReadDir(codeLocalPath) | ||||
| if err == nil { | if err == nil { | ||||
| os.RemoveAll(codeLocalPath) | os.RemoveAll(codeLocalPath) | ||||
| } | } | ||||
| gitRepo, _ := git.OpenRepository(repo.RepoPath()) | gitRepo, _ := git.OpenRepository(repo.RepoPath()) | ||||
| commitID, _ := gitRepo.GetBranchCommitID(branch_name) | |||||
| commitID, _ := gitRepo.GetBranchCommitID(branchName) | |||||
| if err := downloadCode(repo, codeLocalPath, branch_name); err != nil { | |||||
| if err := downloadCode(repo, codeLocalPath, branchName); err != nil { | |||||
| log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) | log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| //todo: upload code (send to file_server todo this work?) | //todo: upload code (send to file_server todo this work?) | ||||
| if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { | if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { | ||||
| log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) | log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { | if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { | ||||
| log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) | log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("Failed to obsMkdir_log", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| // parentDir := VersionOutputPath + "/" | |||||
| if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | ||||
| // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { | // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { | ||||
| log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| //prepare command | |||||
| //todo: download code, download dataset, unzip dataset, exec code, upload model | |||||
| var parameters models.Parameters | var parameters models.Parameters | ||||
| param := make([]models.Parameter, 0) | param := make([]models.Parameter, 0) | ||||
| existDeviceTarget := false | existDeviceTarget := false | ||||
| @@ -326,8 +340,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra | |||||
| err := json.Unmarshal([]byte(params), ¶meters) | err := json.Unmarshal([]byte(params), ¶meters) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("Failed to Unmarshal params: %s (%v)", params, err) | log.Error("Failed to Unmarshal params: %s (%v)", params, err) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("运行参数错误", tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| @@ -350,67 +364,32 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra | |||||
| }) | }) | ||||
| } | } | ||||
| //save param config | |||||
| if isSaveParam == "on" { | |||||
| saveparams := append(param, models.Parameter{ | |||||
| Label: modelarts.TrainUrl, | |||||
| Value: outputObsPath, | |||||
| }, models.Parameter{ | |||||
| Label: modelarts.DataUrl, | |||||
| Value: dataPath, | |||||
| }) | |||||
| if form.ParameterTemplateName == "" { | |||||
| log.Error("ParameterTemplateName is empty") | |||||
| trainJobNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) | |||||
| return | |||||
| } | |||||
| _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ | |||||
| ConfigName: form.ParameterTemplateName, | |||||
| Description: form.PrameterDescription, | |||||
| DataUrl: dataPath, | |||||
| AppUrl: codeObsPath, | |||||
| BootFileUrl: codeObsPath + bootFile, | |||||
| TrainUrl: outputObsPath, | |||||
| Flavor: models.Flavor{ | |||||
| Code: flavorCode, | |||||
| }, | |||||
| WorkServerNum: workServerNumber, | |||||
| EngineID: int64(engineID), | |||||
| LogUrl: logObsPath, | |||||
| PoolID: poolID, | |||||
| Parameter: saveparams, | |||||
| }) | |||||
| if err != nil { | |||||
| log.Error("Failed to CreateTrainJobConfig: %v", err) | |||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) | |||||
| return | |||||
| } | |||||
| } | |||||
| req := &modelarts.GenerateTrainJobReq{ | |||||
| JobName: jobName, | |||||
| DisplayJobName: displayJobName, | |||||
| DataUrl: dataPath, | |||||
| Description: description, | |||||
| CodeObsPath: codeObsPath, | |||||
| BootFileUrl: codeObsPath + bootFile, | |||||
| BootFile: bootFile, | |||||
| TrainUrl: outputObsPath, | |||||
| FlavorCode: flavorCode, | |||||
| WorkServerNumber: workServerNumber, | |||||
| EngineID: int64(engineID), | |||||
| LogUrl: logObsPath, | |||||
| PoolID: poolID, | |||||
| Uuid: uuid, | |||||
| Parameters: param, | |||||
| CommitID: commitID, | |||||
| IsLatestVersion: isLatestVersion, | |||||
| BranchName: branch_name, | |||||
| Params: form.Params, | |||||
| req := &grampus.GenerateTrainJobReq{ | |||||
| JobName: jobName, | |||||
| DisplayJobName: displayJobName, | |||||
| ComputeResource: models.NPUResource, | |||||
| Command: "echo \"test\"", | |||||
| ResourceSpecId: "modelarts.kat1.xlarge", | |||||
| ImageUrl: "", | |||||
| ImageId: "tensorflow_1.15-cann_5.0.3-py_3.7-euler_2.8.3-aarch64", | |||||
| DataUrl: dataPath, | |||||
| Description: description, | |||||
| CodeObsPath: codeObsPath, | |||||
| BootFileUrl: codeObsPath + bootFile, | |||||
| BootFile: bootFile, | |||||
| //TrainUrl: outputObsPath, | |||||
| //FlavorCode: flavorCode, | |||||
| WorkServerNumber: 1, | |||||
| //EngineID: int64(engineID), | |||||
| //LogUrl: logObsPath, | |||||
| //PoolID: poolID, | |||||
| Uuid: uuid, | |||||
| //Parameters: param, | |||||
| CommitID: commitID, | |||||
| IsLatestVersion: isLatestVersion, | |||||
| BranchName: branchName, | |||||
| //Params: form.Params, | |||||
| FlavorName: FlavorName, | FlavorName: FlavorName, | ||||
| EngineName: EngineName, | EngineName: EngineName, | ||||
| VersionCount: VersionCount, | VersionCount: VersionCount, | ||||
| @@ -424,11 +403,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra | |||||
| return | return | ||||
| } | } | ||||
| err = modelarts.GenerateTrainJob(ctx, req) | |||||
| err = grampus.GenerateTrainJob(ctx, req) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("GenerateTrainJob failed:%v", err.Error()) | log.Error("GenerateTrainJob failed:%v", err.Error()) | ||||
| trainJobErrorNewDataPrepare(ctx, form) | |||||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | |||||
| grampusTrainJobNpuNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") | ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") | ||||
| @@ -559,24 +559,11 @@ func TrainJobIndex(ctx *context.Context) { | |||||
| } | } | ||||
| listType := ctx.Query("listType") | listType := ctx.Query("listType") | ||||
| if len(listType) == 0 { | |||||
| listType = models.AllResource | |||||
| } | |||||
| ctx.Data["ListType"] = listType | ctx.Data["ListType"] = listType | ||||
| typeCloudBrain := models.TypeCloudBrainAll | |||||
| if listType == models.GPUResource { | |||||
| typeCloudBrain = models.TypeCloudBrainOne | |||||
| } else if listType == models.NPUResource { | |||||
| typeCloudBrain = models.TypeCloudBrainTwo | |||||
| } else if listType == models.AllResource { | |||||
| typeCloudBrain = models.TypeCloudBrainAll | |||||
| if listType == models.AllResource { | |||||
| listType = "" | |||||
| } | } | ||||
| //else { | |||||
| // log.Error("listType(%s) error", listType) | |||||
| // ctx.ServerError("listType error", errors.New("listType error")) | |||||
| // return | |||||
| //} | |||||
| var jobTypes []string | var jobTypes []string | ||||
| jobTypes = append(jobTypes, string(models.JobTypeTrain)) | jobTypes = append(jobTypes, string(models.JobTypeTrain)) | ||||
| @@ -586,10 +573,10 @@ func TrainJobIndex(ctx *context.Context) { | |||||
| PageSize: setting.UI.IssuePagingNum, | PageSize: setting.UI.IssuePagingNum, | ||||
| }, | }, | ||||
| RepoID: repo.ID, | RepoID: repo.ID, | ||||
| Type: typeCloudBrain, | |||||
| JobTypeNot: false, | JobTypeNot: false, | ||||
| JobTypes: jobTypes, | JobTypes: jobTypes, | ||||
| IsLatestVersion: modelarts.IsLatestVersion, | IsLatestVersion: modelarts.IsLatestVersion, | ||||
| ComputeResource: listType, | |||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| ctx.ServerError("Cloudbrain", err) | ctx.ServerError("Cloudbrain", err) | ||||
| @@ -1103,7 +1103,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||||
| m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) | m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) | ||||
| }) | }) | ||||
| m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew) | m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew) | ||||
| //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate) | |||||
| m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusTrainJobForm{}), repo.GrampusTrainJobNpuCreate) | |||||
| }) | }) | ||||
| }) | }) | ||||
| }, context.RepoRef()) | }, context.RepoRef()) | ||||