From 53bb13abb57af2e69db2492d0476c5453fff6549 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Fri, 9 Sep 2022 08:39:11 +0800 Subject: [PATCH 01/12] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86=E9=80=89=E6=8B=A9=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/dataset.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/models/dataset.go b/models/dataset.go index 4cff4d6d1..720850ed9 100755 --- a/models/dataset.go +++ b/models/dataset.go @@ -131,13 +131,17 @@ func (datasets DatasetList) loadAttachmentAttributes(opts *SearchDatasetOptions) permission = false datasets[i].Repo.GetOwner() if !permission { - isCollaborator, _ := datasets[i].Repo.IsCollaborator(opts.User.ID) - isInRepoTeam,_:=datasets[i].Repo.IsInRepoTeam(opts.User.ID) - - if isCollaborator ||isInRepoTeam { - log.Info("Collaborator user may visit the attach.") + if datasets[i].Repo.OwnerID==opts.User.ID{ permission = true + }else{ + isCollaborator, _ := datasets[i].Repo.IsCollaborator(opts.User.ID) + isInRepoTeam,_:=datasets[i].Repo.IsInRepoTeam(opts.User.ID) + + if isCollaborator ||isInRepoTeam { + permission = true + } } + } permissionMap[datasets[i].ID] = permission From e54de5eb15243ad2ff26a65c4e49ffcdb3ea48e4 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Tue, 13 Sep 2022 11:39:28 +0800 Subject: [PATCH 02/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 71 +++++++++----- modules/auth/cloudbrain.go | 5 + modules/auth/grampus.go | 5 + modules/auth/modelarts.go | 5 + modules/cloudbrain/cloudbrain.go | 14 +++ modules/grampus/grampus.go | 73 +++++++++------ routers/repo/cloudbrain.go | 13 +++ routers/repo/grampus.go | 155 ++++++++++++++++++++++++------- 8 files changed, 254 insertions(+), 87 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index dc56efef7..12c76ce57 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "path" "strconv" "strings" "time" @@ -170,24 +171,25 @@ type Cloudbrain struct { ImageID string //grampus image_id AiCenter string //grampus ai center: center_id+center_name - TrainUrl string //输出模型的obs路径 - BranchName string //分支名称 - Parameters string //传给modelarts的param参数 - BootFile string //启动文件 - DataUrl string //数据集的obs路径 - LogUrl string //日志输出的obs路径 - PreVersionId int64 //父版本的版本id - FlavorCode string //modelarts上的规格id - Description string `xorm:"varchar(256)"` //描述 - WorkServerNumber int //节点数 - FlavorName string //规格名称 - EngineName string //引擎名称 - TotalVersionCount int //任务的所有版本数量,包括删除的 - LabelName string //标签名称 - ModelName string //模型名称 - ModelVersion string //模型版本 - CkptName string //权重文件名称 - ResultUrl string //推理结果的obs路径 + TrainUrl string //输出模型的obs路径 + BranchName string //分支名称 + Parameters string //传给modelarts的param参数 + BootFile string //启动文件 + DataUrl string //数据集的obs路径 + LogUrl string //日志输出的obs路径 + PreVersionId int64 //父版本的版本id + FlavorCode string //modelarts上的规格id + Description string `xorm:"varchar(256)"` //描述 + WorkServerNumber int //节点数 + FlavorName string //规格名称 + EngineName string //引擎名称 + TotalVersionCount int //任务的所有版本数量,包括删除的 + LabelName string //标签名称 + ModelName string //模型名称 + ModelVersion string //模型版本 + CkptName string //权重文件名称 + PreTrainingModelUrl string //预训练模型地址 + ResultUrl string //推理结果的obs路径 User *User `xorm:"-"` Repo *Repository `xorm:"-"` @@ -603,6 +605,16 @@ type ResourceSpec struct { ShareMemMiB int `json:"shareMemMiB"` } +type FlavorInfos struct { + FlavorInfo []*FlavorInfo `json:"flavor_info"` +} + +type FlavorInfo struct { + Id int `json:"id"` + Value string `json:"value"` + Desc string `json:"desc"` +} + type SpecialPools struct { Pools []*SpecialPool `json:"pools"` } @@ -2223,9 +2235,10 @@ func CloudbrainAllStatic(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, er type DatasetInfo struct { DataLocalPath string Name string + FullName string } -func GetDatasetInfo(uuidStr string) (map[string]DatasetInfo, string, error) { +func GetDatasetInfo(uuidStr string, grampusType ...string) (map[string]DatasetInfo, string, error) { var datasetNames string uuids := strings.Split(uuidStr, ";") if len(uuids) > setting.MaxDatasetNum { @@ -2258,16 +2271,26 @@ func GetDatasetInfo(uuidStr string) (map[string]DatasetInfo, string, error) { return nil, datasetNames, errors.New("the dataset name is same") } } + var dataLocalPath string + if len(grampusType) > 0 { + if grampusType[0] == GPU { + dataLocalPath = setting.Attachment.Minio.BasePath + path.Join(attach.UUID[0:1], attach.UUID[1:2]) + "/" + attach.UUID + } else { + dataLocalPath = setting.BasePath + path.Join(attach.UUID[0:1], attach.UUID[1:2]) + "/" + attach.UUID + "/" + } - dataLocalPath := setting.Attachment.Minio.RealPath + - setting.Attachment.Minio.Bucket + "/" + - setting.Attachment.Minio.BasePath + - AttachmentRelativePath(attach.UUID) + - attach.UUID + } else { + dataLocalPath = setting.Attachment.Minio.RealPath + + setting.Attachment.Minio.Bucket + "/" + + setting.Attachment.Minio.BasePath + + AttachmentRelativePath(attach.UUID) + + attach.UUID + } datasetInfos[attach.UUID] = DatasetInfo{ DataLocalPath: dataLocalPath, Name: fileName, + FullName: attach.Name, } if i == 0 { datasetNames = attach.Name diff --git a/modules/auth/cloudbrain.go b/modules/auth/cloudbrain.go index 5bd294f2a..48e23efac 100755 --- a/modules/auth/cloudbrain.go +++ b/modules/auth/cloudbrain.go @@ -23,6 +23,11 @@ type CreateCloudBrainForm struct { BootFile string `form:"boot_file"` Params string `form:"run_para_list"` BranchName string `form:"branch_name"` + ModelName string `form:"model_name"` + ModelVersion string `form:"model_version"` + CkptName string `form:"ckpt_name"` + LabelName string `form:"label_names"` + PreTrainModelUrl string `form:"pre_train_model_url"` DatasetName string `form:"dataset_name"` SpecId int64 `form:"spec_id"` } diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go index 21008ea09..414a7c25d 100755 --- a/modules/auth/grampus.go +++ b/modules/auth/grampus.go @@ -18,6 +18,11 @@ type CreateGrampusTrainJobForm struct { WorkServerNumber int `form:"work_server_number" binding:"Required"` Image string `form:"image"` DatasetName string `form:"dataset_name"` + ModelName string `form:"model_name"` + ModelVersion string `form:"model_version"` + CkptName string `form:"ckpt_name"` + LabelName string `form:"label_names"` + PreTrainModelUrl string `form:"pre_train_model_url"` SpecId int64 `form:"spec_id"` } diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index 23e1f325a..ced5ea1e8 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -48,6 +48,11 @@ type CreateModelArtsTrainJobForm struct { FlavorName string `form:"flaver_names" binding:"Required"` EngineName string `form:"engine_names" binding:"Required"` SpecId int64 `form:"spec_id" binding:"Required"` + ModelName string `form:"model_name"` + ModelVersion string `form:"model_version"` + CkptName string `form:"ckpt_name"` + LabelName string `form:"label_names"` + PreTrainModelUrl string `form:"pre_train_model_url"` } type CreateModelArtsInferenceJobForm struct { diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 748af4a29..5a4d2fe05 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -24,6 +24,7 @@ const ( CodeMountPath = "/code" DataSetMountPath = "/dataset" ModelMountPath = "/model" + PretrainModelMountPath = "/pretrainmodel" LogFile = "log.txt" BenchMarkMountPath = "/benchmark" BenchMarkResourceID = 1 @@ -77,6 +78,8 @@ type GenerateCloudBrainTaskReq struct { ModelVersion string CkptName string LabelName string + PreTrainModelPath string + PreTrainingModelUrl string Spec *models.Specification } @@ -276,6 +279,16 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { }, } + if req.PreTrainingModelUrl != "" { //预训练 + volumes = append(volumes, models.Volume{ + HostPath: models.StHostPath{ + Path: req.PreTrainModelPath, + MountPath: PretrainModelMountPath, + ReadOnly: true, + }, + }) + } + if len(req.DatasetInfos) == 1 { volumes = append(volumes, models.Volume{ HostPath: models.StHostPath{ @@ -359,6 +372,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { CkptName: req.CkptName, ResultUrl: req.ResultPath, LabelName: req.LabelName, + PreTrainingModelUrl: req.PreTrainingModelUrl, CreatedUnix: createTime, UpdatedUnix: createTime, CommitID: req.CommitID, diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 687fb4959..f434a484c 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -62,9 +62,17 @@ type GenerateTrainJobReq struct { TotalVersionCount int ComputeResource string ProcessType string - DatasetName string - Params string - Spec *models.Specification + + DatasetNames string + DatasetInfos map[string]models.DatasetInfo + Params string + ModelName string + LabelName string + CkptName string + ModelVersion string + PreTrainModelPath string + PreTrainingModelUrl string + Spec *models.Specification } func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { @@ -94,33 +102,38 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error jobID := jobResult.JobInfo.JobID err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.JobInfo.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobID, - JobName: req.JobName, - DisplayJobName: req.DisplayJobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeC2Net, - Uuid: req.Uuid, - DatasetName: req.DatasetName, - CommitID: req.CommitID, - IsLatestVersion: req.IsLatestVersion, - ComputeResource: req.ComputeResource, - ImageID: req.ImageId, - TrainUrl: req.TrainUrl, - BranchName: req.BranchName, - Parameters: req.Params, - BootFile: req.BootFile, - DataUrl: req.DataUrl, - Description: req.Description, - WorkServerNumber: req.WorkServerNumber, - EngineName: req.EngineName, - VersionCount: req.VersionCount, - TotalVersionCount: req.TotalVersionCount, - CreatedUnix: createTime, - UpdatedUnix: createTime, - Spec: req.Spec, + Status: TransTrainJobStatus(jobResult.JobInfo.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobID, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeC2Net, + Uuid: req.Uuid, + DatasetName: req.DatasetNames, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: req.ComputeResource, + ImageID: req.ImageId, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, + EngineName: req.EngineName, + VersionCount: req.VersionCount, + TotalVersionCount: req.TotalVersionCount, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainingModelUrl: req.PreTrainingModelUrl, + CkptName: req.CkptName, }) if err != nil { diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 7020f0a61..56a485b66 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -328,6 +328,16 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { Spec: spec, } + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelPath = setting.Attachment.Minio.RealPath + form.PreTrainModelUrl + req.PreTrainingModelUrl = form.PreTrainModelUrl + + } + err = cloudbrain.GenerateTask(req) if err != nil { cloudBrainNewDataPrepare(ctx) @@ -2629,6 +2639,9 @@ func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { param += " --" + parameter.Label + "=" + parameter.Value } } + if form.CkptName != "" { + param += " --pretrainmodelname" + "=" + form.CkptName + } command += "python /code/" + bootFile + param + " > " + cloudbrain.ModelMountPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index d7e799427..b32070a84 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -210,7 +210,6 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain repo := ctx.Repo.Repository codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/" codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" - dataMinioPath := setting.Attachment.Minio.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid branchName := form.BranchName image := strings.TrimSpace(form.Image) @@ -290,11 +289,12 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } //check dataset - attachment, err := models.GetAttachmentByUUID(uuid) + + datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU) if err != nil { - log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) + log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplGrampusTrainJobGPUNew, &form) return } @@ -336,8 +336,22 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain return } + var datasetRemotePath, allFileName string + for _, datasetInfo := range datasetInfos { + if datasetRemotePath == "" { + datasetRemotePath = datasetInfo.DataLocalPath + allFileName = datasetInfo.FullName + } else { + datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + allFileName = allFileName + ";" + datasetInfo.FullName + } + + } + //prepare command - command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", dataMinioPath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", attachment.Name) + preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName) + + command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName) if err != nil { log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) @@ -348,26 +362,37 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) req := &grampus.GenerateTrainJobReq{ - JobName: jobName, - DisplayJobName: displayJobName, - ComputeResource: models.GPUResource, - ProcessType: grampus.ProcessorTypeGPU, - Command: command, - ImageUrl: image, - Description: description, - BootFile: bootFile, - Uuid: uuid, - CommitID: commitID, - BranchName: branchName, - Params: form.Params, - EngineName: image, - DatasetName: attachment.Name, + JobName: jobName, + DisplayJobName: displayJobName, + ComputeResource: models.GPUResource, + ProcessType: grampus.ProcessorTypeGPU, + Command: command, + ImageUrl: image, + Description: description, + BootFile: bootFile, + Uuid: uuid, + CommitID: commitID, + BranchName: branchName, + Params: form.Params, + EngineName: image, + DatasetNames: datasetNames, + DatasetInfos: datasetInfos, + IsLatestVersion: modelarts.IsLatestVersion, VersionCount: modelarts.VersionCountOne, WorkServerNumber: 1, Spec: spec, } + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainingModelUrl = form.PreTrainModelUrl + + } + err = grampus.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"]) @@ -378,6 +403,17 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func getPreTrainModelPath(pretrainModelDir string, fileName string) string { + index := strings.Index(pretrainModelDir, "/") + if index > 0 { + filterBucket := pretrainModelDir[index+1:] + return filterBucket + fileName + } else { + return "" + } + +} + func checkSpecialPool(ctx *context.Context, resourceType string) string { grampus.InitSpecialPool() if grampus.SpecialPools != nil { @@ -410,7 +446,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain repo := ctx.Repo.Repository codeLocalPath := setting.JobPath + jobName + modelarts.CodePath codeObsPath := grampus.JobPath + jobName + modelarts.CodePath - dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" + //dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" branchName := form.BranchName isLatestVersion := modelarts.IsLatestVersion versionCount := modelarts.VersionCountOne @@ -492,11 +528,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } //check dataset - attachment, err := models.GetAttachmentByUUID(uuid) + datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU) if err != nil { - log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) + log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplGrampusTrainJobNPUNew, &form) return } @@ -528,8 +564,21 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain return } + var datasetRemotePath, allFileName string + for _, datasetInfo := range datasetInfos { + if datasetRemotePath == "" { + datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'" + allFileName = datasetInfo.FullName + } else { + datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'" + allFileName = allFileName + ";" + datasetInfo.FullName + } + + } + //prepare command - command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+"'"+attachment.Name+"'", bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name) + preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName) + command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName) if err != nil { log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) @@ -546,7 +595,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ProcessType: grampus.ProcessorTypeNPU, Command: command, ImageId: form.ImageID, - DataUrl: dataObsPath, Description: description, CodeObsPath: codeObsPath, BootFileUrl: codeObsPath + bootFile, @@ -560,9 +608,18 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain EngineName: engineName, VersionCount: versionCount, TotalVersionCount: modelarts.TotalVersionCount, - DatasetName: attachment.Name, + DatasetNames: datasetNames, + DatasetInfos: datasetInfos, Spec: spec, } + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainingModelUrl = form.PreTrainModelUrl + + } err = grampus.GenerateTrainJob(ctx, req) if err != nil { @@ -776,7 +833,7 @@ func GrampusGetLog(ctx *context.Context) { return } -func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName string) (string, error) { +func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName string) (string, error) { var command string workDir := grampus.NpuWorkDir @@ -788,18 +845,18 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo //download code & dataset if processorType == grampus.ProcessorTypeNPU { commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';" + commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload) command += commandDownload } else if processorType == grampus.ProcessorTypeGPU { - commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';" + commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "'" + commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload) command += commandDownload } //unzip code & dataset - toolUnzip := "unzip -q '" - if strings.HasSuffix(datasetName, ".tar.gz") { - toolUnzip = "tar -zxvf '" - } - commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + toolUnzip + datasetName + "';" + unZipDatasetCommand := generateDatasetUnzipCommand(datasetName) + + commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand command += commandUnzip command += "echo \"unzip finished;start to exec code;\";" @@ -859,6 +916,38 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo return command, nil } +func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string { + commandDownloadTemp := commandDownload + if pretrainModelPath != "" { + commandDownloadTemp += " '" + pretrainModelPath + "' '" + pretrainModelFileName + "'" + } + commandDownloadTemp += ";" + return commandDownloadTemp +} + +func generateDatasetUnzipCommand(datasetName string) string { + var unZipDatasetCommand string + + datasetNameArray := strings.Split(datasetName, ";") + if len(datasetNameArray) == 1 { //单数据集 + unZipDatasetCommand = "unzip -q '" + datasetName + "';" + if strings.HasSuffix(datasetName, ".tar.gz") { + unZipDatasetCommand = "tar --strip-components=1 -zxvf '" + datasetName + "';" + } + + } else { //多数据集 + for _, datasetNameTemp := range datasetNameArray { + if strings.HasSuffix(datasetName, ".tar.gz") { + unZipDatasetCommand = unZipDatasetCommand + "tar -zxvf '" + datasetName + "';" + } else { + unZipDatasetCommand = unZipDatasetCommand + "unzip -q '" + datasetNameTemp + "' -d './" + strings.TrimSuffix(datasetNameTemp, ".zip") + "';" + } + } + + } + return unZipDatasetCommand +} + func downloadZipCode(ctx *context.Context, codePath, branchName string) error { archiveType := git.ZIP archivePath := codePath From 9df8fa9f24032d1cae07f8f034f5dc233a2f126d Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Tue, 13 Sep 2022 14:48:34 +0800 Subject: [PATCH 03/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/modelarts/modelarts.go | 132 ++++++++++++++++++--------------- routers/repo/modelarts.go | 16 ++++ 2 files changed, 87 insertions(+), 61 deletions(-) diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 4539699ad..ead824b60 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -75,35 +75,40 @@ var ( ) type GenerateTrainJobReq struct { - JobName string - DisplayJobName string - Uuid string - Description string - CodeObsPath string - BootFile string - BootFileUrl string - DataUrl string - TrainUrl string - LogUrl string - PoolID string - WorkServerNumber int - EngineID int64 - Parameters []models.Parameter - CommitID string - IsLatestVersion string - Params string - BranchName string - PreVersionId int64 - PreVersionName string - FlavorCode string - FlavorName string - VersionCount int - EngineName string - TotalVersionCount int - UserImageUrl string - UserCommand string - DatasetName string - Spec *models.Specification + JobName string + DisplayJobName string + Uuid string + Description string + CodeObsPath string + BootFile string + BootFileUrl string + DataUrl string + TrainUrl string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + CommitID string + IsLatestVersion string + Params string + BranchName string + PreVersionId int64 + PreVersionName string + FlavorCode string + FlavorName string + VersionCount int + EngineName string + TotalVersionCount int + UserImageUrl string + UserCommand string + DatasetName string + Spec *models.Specification + ModelName string + LabelName string + CkptName string + ModelVersion string + PreTrainingModelUrl string } type GenerateInferenceJobReq struct { @@ -407,38 +412,43 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error } jobId := strconv.FormatInt(jobResult.JobID, 10) createErr = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobId, - JobName: req.JobName, - DisplayJobName: req.DisplayJobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainTwo, - VersionID: jobResult.VersionID, - VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: req.DatasetName, - CommitID: req.CommitID, - IsLatestVersion: req.IsLatestVersion, - ComputeResource: models.NPUResource, - EngineID: req.EngineID, - TrainUrl: req.TrainUrl, - BranchName: req.BranchName, - Parameters: req.Params, - BootFile: req.BootFile, - DataUrl: req.DataUrl, - LogUrl: req.LogUrl, - FlavorCode: req.Spec.SourceSpecId, - Description: req.Description, - WorkServerNumber: req.WorkServerNumber, - FlavorName: req.FlavorName, - EngineName: req.EngineName, - VersionCount: req.VersionCount, - TotalVersionCount: req.TotalVersionCount, - CreatedUnix: createTime, - UpdatedUnix: createTime, - Spec: req.Spec, + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobId, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: req.DatasetName, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: models.NPUResource, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + LogUrl: req.LogUrl, + FlavorCode: req.Spec.SourceSpecId, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, + FlavorName: req.FlavorName, + EngineName: req.EngineName, + VersionCount: req.VersionCount, + TotalVersionCount: req.TotalVersionCount, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainingModelUrl: req.PreTrainingModelUrl, + CkptName: req.CkptName, }) if createErr != nil { diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index b4f6f000e..121c4fd78 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -1290,6 +1290,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) Value: string(jsondatas), }) } + if form.ModelName != "" { //使用预训练模型训练 + ckptUrl := "/" + form.PreTrainModelUrl + form.CkptName + param = append(param, models.Parameter{ + Label: modelarts.CkptUrl, + Value: "s3:/" + ckptUrl, + }) + } //save param config // if isSaveParam == "on" { @@ -1358,6 +1365,15 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) DatasetName: datasetNames, Spec: spec, } + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainingModelUrl = form.PreTrainModelUrl + + } + userCommand, userImageUrl := getUserCommand(engineID, req) req.UserCommand = userCommand req.UserImageUrl = userImageUrl From e24b67cfa97bfca382801751eb4f3e9c38904482 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Tue, 13 Sep 2022 15:16:39 +0800 Subject: [PATCH 04/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 38 ++++----- modules/cloudbrain/cloudbrain.go | 6 +- modules/grampus/grampus.go | 84 +++++++++--------- modules/modelarts/modelarts.go | 142 +++++++++++++++---------------- routers/repo/cloudbrain.go | 2 +- routers/repo/grampus.go | 4 +- routers/repo/modelarts.go | 9 +- 7 files changed, 146 insertions(+), 139 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 12c76ce57..f93b653e1 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -171,25 +171,25 @@ type Cloudbrain struct { ImageID string //grampus image_id AiCenter string //grampus ai center: center_id+center_name - TrainUrl string //输出模型的obs路径 - BranchName string //分支名称 - Parameters string //传给modelarts的param参数 - BootFile string //启动文件 - DataUrl string //数据集的obs路径 - LogUrl string //日志输出的obs路径 - PreVersionId int64 //父版本的版本id - FlavorCode string //modelarts上的规格id - Description string `xorm:"varchar(256)"` //描述 - WorkServerNumber int //节点数 - FlavorName string //规格名称 - EngineName string //引擎名称 - TotalVersionCount int //任务的所有版本数量,包括删除的 - LabelName string //标签名称 - ModelName string //模型名称 - ModelVersion string //模型版本 - CkptName string //权重文件名称 - PreTrainingModelUrl string //预训练模型地址 - ResultUrl string //推理结果的obs路径 + TrainUrl string //输出模型的obs路径 + BranchName string //分支名称 + Parameters string //传给modelarts的param参数 + BootFile string //启动文件 + DataUrl string //数据集的obs路径 + LogUrl string //日志输出的obs路径 + PreVersionId int64 //父版本的版本id + FlavorCode string //modelarts上的规格id + Description string `xorm:"varchar(256)"` //描述 + WorkServerNumber int //节点数 + FlavorName string //规格名称 + EngineName string //引擎名称 + TotalVersionCount int //任务的所有版本数量,包括删除的 + LabelName string //标签名称 + ModelName string //模型名称 + ModelVersion string //模型版本 + CkptName string //权重文件名称 + PreTrainModelUrl string //预训练模型地址 + ResultUrl string //推理结果的obs路径 User *User `xorm:"-"` Repo *Repository `xorm:"-"` diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 5a4d2fe05..4e527b6bf 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -79,7 +79,7 @@ type GenerateCloudBrainTaskReq struct { CkptName string LabelName string PreTrainModelPath string - PreTrainingModelUrl string + PreTrainModelUrl string Spec *models.Specification } @@ -279,7 +279,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { }, } - if req.PreTrainingModelUrl != "" { //预训练 + if req.PreTrainModelUrl != "" { //预训练 volumes = append(volumes, models.Volume{ HostPath: models.StHostPath{ Path: req.PreTrainModelPath, @@ -372,7 +372,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { CkptName: req.CkptName, ResultUrl: req.ResultPath, LabelName: req.LabelName, - PreTrainingModelUrl: req.PreTrainingModelUrl, + PreTrainModelUrl: req.PreTrainModelUrl, CreatedUnix: createTime, UpdatedUnix: createTime, CommitID: req.CommitID, diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index f434a484c..45c127141 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -63,16 +63,16 @@ type GenerateTrainJobReq struct { ComputeResource string ProcessType string - DatasetNames string - DatasetInfos map[string]models.DatasetInfo - Params string - ModelName string - LabelName string - CkptName string - ModelVersion string - PreTrainModelPath string - PreTrainingModelUrl string - Spec *models.Specification + DatasetNames string + DatasetInfos map[string]models.DatasetInfo + Params string + ModelName string + LabelName string + CkptName string + ModelVersion string + PreTrainModelPath string + PreTrainModelUrl string + Spec *models.Specification } func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { @@ -102,38 +102,38 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error jobID := jobResult.JobInfo.JobID err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.JobInfo.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobID, - JobName: req.JobName, - DisplayJobName: req.DisplayJobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeC2Net, - Uuid: req.Uuid, - DatasetName: req.DatasetNames, - CommitID: req.CommitID, - IsLatestVersion: req.IsLatestVersion, - ComputeResource: req.ComputeResource, - ImageID: req.ImageId, - TrainUrl: req.TrainUrl, - BranchName: req.BranchName, - Parameters: req.Params, - BootFile: req.BootFile, - DataUrl: req.DataUrl, - Description: req.Description, - WorkServerNumber: req.WorkServerNumber, - EngineName: req.EngineName, - VersionCount: req.VersionCount, - TotalVersionCount: req.TotalVersionCount, - CreatedUnix: createTime, - UpdatedUnix: createTime, - Spec: req.Spec, - ModelName: req.ModelName, - ModelVersion: req.ModelVersion, - LabelName: req.LabelName, - PreTrainingModelUrl: req.PreTrainingModelUrl, - CkptName: req.CkptName, + Status: TransTrainJobStatus(jobResult.JobInfo.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobID, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeC2Net, + Uuid: req.Uuid, + DatasetName: req.DatasetNames, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: req.ComputeResource, + ImageID: req.ImageId, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, + EngineName: req.EngineName, + VersionCount: req.VersionCount, + TotalVersionCount: req.TotalVersionCount, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, }) if err != nil { diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index ead824b60..f35601191 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -75,40 +75,40 @@ var ( ) type GenerateTrainJobReq struct { - JobName string - DisplayJobName string - Uuid string - Description string - CodeObsPath string - BootFile string - BootFileUrl string - DataUrl string - TrainUrl string - LogUrl string - PoolID string - WorkServerNumber int - EngineID int64 - Parameters []models.Parameter - CommitID string - IsLatestVersion string - Params string - BranchName string - PreVersionId int64 - PreVersionName string - FlavorCode string - FlavorName string - VersionCount int - EngineName string - TotalVersionCount int - UserImageUrl string - UserCommand string - DatasetName string - Spec *models.Specification - ModelName string - LabelName string - CkptName string - ModelVersion string - PreTrainingModelUrl string + JobName string + DisplayJobName string + Uuid string + Description string + CodeObsPath string + BootFile string + BootFileUrl string + DataUrl string + TrainUrl string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + CommitID string + IsLatestVersion string + Params string + BranchName string + PreVersionId int64 + PreVersionName string + FlavorCode string + FlavorName string + VersionCount int + EngineName string + TotalVersionCount int + UserImageUrl string + UserCommand string + DatasetName string + Spec *models.Specification + ModelName string + LabelName string + CkptName string + ModelVersion string + PreTrainModelUrl string } type GenerateInferenceJobReq struct { @@ -412,43 +412,43 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error } jobId := strconv.FormatInt(jobResult.JobID, 10) createErr = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobId, - JobName: req.JobName, - DisplayJobName: req.DisplayJobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainTwo, - VersionID: jobResult.VersionID, - VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: req.DatasetName, - CommitID: req.CommitID, - IsLatestVersion: req.IsLatestVersion, - ComputeResource: models.NPUResource, - EngineID: req.EngineID, - TrainUrl: req.TrainUrl, - BranchName: req.BranchName, - Parameters: req.Params, - BootFile: req.BootFile, - DataUrl: req.DataUrl, - LogUrl: req.LogUrl, - FlavorCode: req.Spec.SourceSpecId, - Description: req.Description, - WorkServerNumber: req.WorkServerNumber, - FlavorName: req.FlavorName, - EngineName: req.EngineName, - VersionCount: req.VersionCount, - TotalVersionCount: req.TotalVersionCount, - CreatedUnix: createTime, - UpdatedUnix: createTime, - Spec: req.Spec, - ModelName: req.ModelName, - ModelVersion: req.ModelVersion, - LabelName: req.LabelName, - PreTrainingModelUrl: req.PreTrainingModelUrl, - CkptName: req.CkptName, + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobId, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: req.DatasetName, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: models.NPUResource, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + LogUrl: req.LogUrl, + FlavorCode: req.Spec.SourceSpecId, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, + FlavorName: req.FlavorName, + EngineName: req.EngineName, + VersionCount: req.VersionCount, + TotalVersionCount: req.TotalVersionCount, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, }) if createErr != nil { diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 56a485b66..8cd45e06f 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -334,7 +334,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { req.CkptName = form.CkptName req.ModelVersion = form.ModelVersion req.PreTrainModelPath = setting.Attachment.Minio.RealPath + form.PreTrainModelUrl - req.PreTrainingModelUrl = form.PreTrainModelUrl + req.PreTrainModelUrl = form.PreTrainModelUrl } diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index b32070a84..ed869e76e 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -389,7 +389,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain req.LabelName = form.LabelName req.CkptName = form.CkptName req.ModelVersion = form.ModelVersion - req.PreTrainingModelUrl = form.PreTrainModelUrl + req.PreTrainModelUrl = form.PreTrainModelUrl } @@ -617,7 +617,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain req.LabelName = form.LabelName req.CkptName = form.CkptName req.ModelVersion = form.ModelVersion - req.PreTrainingModelUrl = form.PreTrainModelUrl + req.PreTrainModelUrl = form.PreTrainModelUrl } diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 121c4fd78..cb4b2c1cc 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -1010,6 +1010,13 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error { ctx.Data["engine_id"] = task.EngineID ctx.Data["datasetType"] = models.TypeCloudBrainTwo + //pretrain model + ctx.Data["model_name"] = task.ModelName + ctx.Data["model_version"] = task.ModelVersion + ctx.Data["ckpt_name"] = task.CkptName + ctx.Data["label_names"] = task.LabelName + ctx.Data["pre_train_model_url"] = task.PreTrainModelUrl + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { ctx.ServerError("getConfigList failed:", err) @@ -1370,7 +1377,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) req.LabelName = form.LabelName req.CkptName = form.CkptName req.ModelVersion = form.ModelVersion - req.PreTrainingModelUrl = form.PreTrainModelUrl + req.PreTrainModelUrl = form.PreTrainModelUrl } From 9ce1b017b7bab8858f1ff45b2390b492ecfd56c4 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Tue, 13 Sep 2022 16:32:30 +0800 Subject: [PATCH 05/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/grampus/grampus.go | 6 +++--- modules/setting/setting.go | 15 +++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 45c127141..3cdd59c5c 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -22,9 +22,6 @@ const ( GpuWorkDir = "/tmp/" NpuWorkDir = "/cache/" - CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;" + - "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" - CodeArchiveName = "master.zip" ) @@ -34,6 +31,9 @@ var ( ImageInfos *setting.StImageInfosModelArts SpecialPools *models.SpecialPools + + CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/" + setting.Grampus.SyncScriptProject + "/archive/master.zip;" + + "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" ) type GenerateTrainJobReq struct { diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 1992baf54..7d726a773 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -583,12 +583,13 @@ var ( //grampus config Grampus = struct { - Env string - Host string - UserName string - Password string - SpecialPools string - C2NetSequence string + Env string + Host string + UserName string + Password string + SpecialPools string + C2NetSequence string + SyncScriptProject string }{} C2NetInfos *C2NetSqInfos @@ -1558,6 +1559,8 @@ func getGrampusConfig() { log.Error("Unmarshal(C2NetSequence) failed:%v", err) } } + Grampus.SyncScriptProject = sec.Key("SYNC_SCRIPT_PROJECT").MustString("script_for_grampus") + } func SetRadarMapConfig() { From 601b7873b9f01bf41579a31ccec8a89b74092b58 Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Wed, 14 Sep 2022 12:19:55 +0800 Subject: [PATCH 06/12] fix issue --- templates/custom/select_model.tmpl | 37 +++++ templates/repo/cloudbrain/trainjob/new.tmpl | 4 +- templates/repo/grampus/trainjob/gpu/new.tmpl | 3 +- templates/repo/grampus/trainjob/npu/new.tmpl | 2 + templates/repo/modelarts/trainjob/new.tmpl | 6 +- .../repo/modelarts/trainjob/version_new.tmpl | 4 +- web_src/js/features/cloudbrainShow.js | 130 ++++++++++++++++++ 7 files changed, 178 insertions(+), 8 deletions(-) create mode 100644 templates/custom/select_model.tmpl diff --git a/templates/custom/select_model.tmpl b/templates/custom/select_model.tmpl new file mode 100644 index 000000000..81332b873 --- /dev/null +++ b/templates/custom/select_model.tmpl @@ -0,0 +1,37 @@ + +
+   +
+ +
+
+ +
+
+ + +
+ + + + +
\ No newline at end of file diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl index 709490ac1..b2cff22cc 100755 --- a/templates/repo/cloudbrain/trainjob/new.tmpl +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -70,7 +70,7 @@
{{template "repo/header" .}}
- + {{template "base/alert" .}}

{{.i18n.Tr "repo.modelarts.train_job.new"}} @@ -168,7 +168,7 @@ {{end}}

- + {{template "custom/select_model" .}} - + {{template "custom/select_model" .}}
diff --git a/templates/repo/grampus/trainjob/npu/new.tmpl b/templates/repo/grampus/trainjob/npu/new.tmpl index 88a41779e..a11d84bb3 100755 --- a/templates/repo/grampus/trainjob/npu/new.tmpl +++ b/templates/repo/grampus/trainjob/npu/new.tmpl @@ -57,6 +57,7 @@
{{template "repo/header" .}}
+ {{template "base/alert" .}}

{{.i18n.Tr "repo.modelarts.train_job.new"}} @@ -149,6 +150,7 @@ {{end}}

+ {{template "custom/select_model" .}}
- - + + {{template "custom/select_model" .}}
diff --git a/templates/repo/modelarts/trainjob/version_new.tmpl b/templates/repo/modelarts/trainjob/version_new.tmpl index c95f5699a..ffc1045e8 100644 --- a/templates/repo/modelarts/trainjob/version_new.tmpl +++ b/templates/repo/modelarts/trainjob/version_new.tmpl @@ -55,7 +55,7 @@
{{template "repo/header" .}}
- + {{template "base/alert" .}}

{{.i18n.Tr "repo.modelarts.train_job.new"}} @@ -154,7 +154,7 @@

- + {{template "custom/select_model" .}}
diff --git a/web_src/js/features/cloudbrainShow.js b/web_src/js/features/cloudbrainShow.js index 73a8ed7b6..ac1263b9d 100644 --- a/web_src/js/features/cloudbrainShow.js +++ b/web_src/js/features/cloudbrainShow.js @@ -443,4 +443,134 @@ export default async function initCloudrainSow() { html += "
"; $(`#dir_list${version_name}`).append(html); } + + let nameMap, nameList; + let RepoLink = $(".cloudbrain-type").data("repo-link"); + let type = $(".cloudbrain-type").data("cloudbrain-type"); + let flagModel = $(".cloudbrain-type").data("flag-model"); + // 获取模型列表和模型名称对应的模型版本 + $(document).ready(function () { + if (!flagModel) return; + else { + $.get( + `${RepoLink}/modelmanage/query_model_for_predict?type=${type}`, + (data) => { + nameMap = data.nameMap; + nameList = data.nameList; + let html = ""; + nameList.forEach((element) => { + html += `
${element}
`; + }); + if (nameList.length !== 0) { + $("#model_name").append(html); + } + let faildModelName = $('input[name="model_name"]').val(); + let faildModelVersion = $('input[name="model_version"]').val(); + let faildTrainUrl = $('input[name="pre_train_model_url"]').val(); + let faildCkptName = $('input[name="ckpt_name"]').val(); + // 新建错误的表单返回初始化 + if (faildModelName) { + $("#select_model").dropdown("set text", faildModelName); + $("#select_model").dropdown("set value", faildModelName); + $("#select_model_version").dropdown("set text", faildModelVersion); + $("#select_model_version").dropdown("set value", faildTrainUrl); + $("#select_model_checkpoint").dropdown("set text", faildCkptName); + $("#select_model_checkpoint").dropdown("set value", faildCkptName); + } + } + ); + } + $("#select_model").dropdown({ + onChange: function (value, text, $selectedItem) { + $("#model_name_version").empty(); + let html = ""; + nameMap[value].forEach((element) => { + let { TrainTaskInfo } = element; + TrainTaskInfo = JSON.parse(TrainTaskInfo); + html += `
${element.Version}
`; + }); + $("#model_name_version").append(html); + const initVersionText = $( + "#model_name_version div.item:first-child" + ).text(); + const initVersionValue = $( + "#model_name_version div.item:first-child" + ).data("value"); + + $("#select_model_version").dropdown("set text", initVersionText); + $("#select_model_version").dropdown( + "set value", + initVersionValue, + initVersionText, + $("#model_name_version div.item:first-child") + ); + }, + }); + + $("#select_model_version").dropdown({ + onChange: function (value, text, $selectedItem) { + const dataID = + $selectedItem && $selectedItem[0].getAttribute("data-id"); + $("input#ai_model_version").val(text); + $("#select_model_checkpoint").addClass("loading"); + $("#model_checkpoint").empty(); + let html = ""; + loadCheckpointList(dataID).then((res) => { + res.forEach((element) => { + const ckptSuffix = element.FileName.split("."); + const loadCheckpointFile = [ + "ckpt", + "pb", + "h5", + "json", + "pkl", + "pth", + "t7", + "pdparams", + "onnx", + "pbtxt", + "keras", + "mlmodel", + "cfg", + "pt", + ]; + if ( + !element.IsDir && + loadCheckpointFile.includes(ckptSuffix[ckptSuffix.length - 1]) + ) { + html += `
${element.FileName}
`; + } + }); + $("#model_checkpoint").append(html); + $("#select_model_checkpoint").removeClass("loading"); + const initVersionText = $( + "#model_checkpoint div.item:first-child" + ).text(); + const initVersionValue = $( + "#model_checkpoint div.item:first-child" + ).data("value"); + + $("#select_model_checkpoint").dropdown("set text", initVersionText); + $("#select_model_checkpoint").dropdown( + "set value", + initVersionValue, + initVersionText, + $("#model_name_version div.item:first-child") + ); + }); + }, + }); + }); + + function loadCheckpointList(value) { + return new Promise((resolve, reject) => { + $.get( + `${RepoLink}/modelmanage/query_modelfile_for_predict`, + { ID: value }, + (data) => { + resolve(data); + } + ); + }); + } } From 4752033910ed48ad3c6a9fe3cccce61574468f92 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Wed, 14 Sep 2022 15:07:53 +0800 Subject: [PATCH 07/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- routers/repo/grampus.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index ed869e76e..dee8fbdd1 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -887,6 +887,10 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo } } + if pretrainModelFileName != "" { + paramCode += " --pretrainmodelname" + "=" + pretrainModelFileName + } + var commandCode string if processorType == grampus.ProcessorTypeNPU { commandCode = "/bin/bash /home/work/run_train_for_openi.sh " + workDir + "code/" + strings.ToLower(repoName) + "/" + bootFile + " /tmp/log/train.log" + paramCode + ";" From cc8b76eacaab2f3e9a6281c9c0d69ea1a591d5c1 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Wed, 14 Sep 2022 15:25:41 +0800 Subject: [PATCH 08/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/modelarts/modelarts.go | 5 +++++ routers/repo/modelarts.go | 17 +++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index f35601191..5f318a546 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -598,6 +598,11 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job CreatedUnix: createTime, UpdatedUnix: createTime, Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, }) if createErr != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error()) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index cb4b2c1cc..13ae93dcf 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -1656,6 +1656,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ }) } + if form.ModelName != "" { //使用预训练模型训练 + ckptUrl := "/" + form.PreTrainModelUrl + form.CkptName + param = append(param, models.Parameter{ + Label: modelarts.CkptUrl, + Value: "s3:/" + ckptUrl, + }) + } + // //save param config // if isSaveParam == "on" { // saveparams := append(param, models.Parameter{ @@ -1730,6 +1738,15 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ DatasetName: datasetNames, Spec: spec, } + + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelUrl = form.PreTrainModelUrl + + } userCommand, userImageUrl := getUserCommand(engineID, req) req.UserCommand = userCommand req.UserImageUrl = userImageUrl From 8187c87e3df47528dc93faf0e80f06efc24eecaa Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Wed, 14 Sep 2022 15:35:48 +0800 Subject: [PATCH 09/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/grampus/grampus.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 3cdd59c5c..76f33bf81 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -32,7 +32,7 @@ var ( SpecialPools *models.SpecialPools - CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/" + setting.Grampus.SyncScriptProject + "/archive/master.zip;" + + CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget https://git.openi.org.cn/OpenIOSSG/" + setting.Grampus.SyncScriptProject + "/archive/master.zip;" + "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" ) From fee45b7b0044951764da40fc36b7e290cefb50ce Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Wed, 14 Sep 2022 15:51:08 +0800 Subject: [PATCH 10/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/grampus/grampus.go | 2 +- routers/repo/grampus.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 76f33bf81..41bceddd7 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -32,7 +32,7 @@ var ( SpecialPools *models.SpecialPools - CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget https://git.openi.org.cn/OpenIOSSG/" + setting.Grampus.SyncScriptProject + "/archive/master.zip;" + + CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget https://git.openi.org.cn/OpenIOSSG/%s/archive/master.zip;" + "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" ) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index dee8fbdd1..831551841 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -841,7 +841,7 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo workDir = grampus.GpuWorkDir } - command += "pwd;cd " + workDir + grampus.CommandPrepareScript + command += "pwd;cd " + workDir + fmt.Sprint(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject) //download code & dataset if processorType == grampus.ProcessorTypeNPU { commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';" From c94dd366d1b3edea897c7548414ce1c9ab18a85f Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Wed, 14 Sep 2022 16:00:25 +0800 Subject: [PATCH 11/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- routers/repo/grampus.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 831551841..81b1cf743 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -841,7 +841,7 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo workDir = grampus.GpuWorkDir } - command += "pwd;cd " + workDir + fmt.Sprint(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject) + command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject) //download code & dataset if processorType == grampus.ProcessorTypeNPU { commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';" From f7677b1cb5e4224a602d3586f475531776e3e9cb Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Wed, 14 Sep 2022 16:09:47 +0800 Subject: [PATCH 12/12] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/grampus/grampus.go | 2 +- routers/repo/grampus.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 41bceddd7..9ff2ed212 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -33,7 +33,7 @@ var ( SpecialPools *models.SpecialPools CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget https://git.openi.org.cn/OpenIOSSG/%s/archive/master.zip;" + - "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" + "echo \"finish loading script\";unzip -q master.zip;cd %s;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" ) type GenerateTrainJobReq struct { diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 81b1cf743..cae7be622 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -841,7 +841,7 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo workDir = grampus.GpuWorkDir } - command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject) + command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject) //download code & dataset if processorType == grampus.ProcessorTypeNPU { commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';"