diff --git a/models/cloudbrain.go b/models/cloudbrain.go index e43b86030..c05ba0f21 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "path" "strconv" "strings" "time" @@ -188,6 +189,7 @@ type Cloudbrain struct { ModelName string //模型名称 ModelVersion string //模型版本 CkptName string //权重文件名称 + PreTrainModelUrl string //预训练模型地址 ResultUrl string //推理结果的obs路径 User *User `xorm:"-"` @@ -655,6 +657,8 @@ type FlavorInfo struct { UnitPrice int64 `json:"unitPrice"` } + + type SpecialPools struct { Pools []*SpecialPool `json:"pools"` } @@ -2296,9 +2300,10 @@ func GetCloudbrainByIds(ids []int64) ([]*Cloudbrain, error) { type DatasetInfo struct { DataLocalPath string Name string + FullName string } -func GetDatasetInfo(uuidStr string) (map[string]DatasetInfo, string, error) { +func GetDatasetInfo(uuidStr string, grampusType ...string) (map[string]DatasetInfo, string, error) { var datasetNames string uuids := strings.Split(uuidStr, ";") if len(uuids) > setting.MaxDatasetNum { @@ -2331,16 +2336,26 @@ func GetDatasetInfo(uuidStr string) (map[string]DatasetInfo, string, error) { return nil, datasetNames, errors.New("the dataset name is same") } } + var dataLocalPath string + if len(grampusType) > 0 { + if grampusType[0] == GPU { + dataLocalPath = setting.Attachment.Minio.BasePath + path.Join(attach.UUID[0:1], attach.UUID[1:2]) + "/" + attach.UUID + } else { + dataLocalPath = setting.BasePath + path.Join(attach.UUID[0:1], attach.UUID[1:2]) + "/" + attach.UUID + "/" + } - dataLocalPath := setting.Attachment.Minio.RealPath + - setting.Attachment.Minio.Bucket + "/" + - setting.Attachment.Minio.BasePath + - AttachmentRelativePath(attach.UUID) + - attach.UUID + } else { + dataLocalPath = setting.Attachment.Minio.RealPath + + setting.Attachment.Minio.Bucket + "/" + + setting.Attachment.Minio.BasePath + + AttachmentRelativePath(attach.UUID) + + attach.UUID + } datasetInfos[attach.UUID] = DatasetInfo{ DataLocalPath: dataLocalPath, Name: fileName, + FullName: attach.Name, } if i == 0 { datasetNames = attach.Name diff --git a/modules/auth/cloudbrain.go b/modules/auth/cloudbrain.go index 5bd294f2a..48e23efac 100755 --- a/modules/auth/cloudbrain.go +++ b/modules/auth/cloudbrain.go @@ -23,6 +23,11 @@ type CreateCloudBrainForm struct { BootFile string `form:"boot_file"` Params string `form:"run_para_list"` BranchName string `form:"branch_name"` + ModelName string `form:"model_name"` + ModelVersion string `form:"model_version"` + CkptName string `form:"ckpt_name"` + LabelName string `form:"label_names"` + PreTrainModelUrl string `form:"pre_train_model_url"` DatasetName string `form:"dataset_name"` SpecId int64 `form:"spec_id"` } diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go index 21008ea09..414a7c25d 100755 --- a/modules/auth/grampus.go +++ b/modules/auth/grampus.go @@ -18,6 +18,11 @@ type CreateGrampusTrainJobForm struct { WorkServerNumber int `form:"work_server_number" binding:"Required"` Image string `form:"image"` DatasetName string `form:"dataset_name"` + ModelName string `form:"model_name"` + ModelVersion string `form:"model_version"` + CkptName string `form:"ckpt_name"` + LabelName string `form:"label_names"` + PreTrainModelUrl string `form:"pre_train_model_url"` SpecId int64 `form:"spec_id"` } diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index 23e1f325a..ced5ea1e8 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -48,6 +48,11 @@ type CreateModelArtsTrainJobForm struct { FlavorName string `form:"flaver_names" binding:"Required"` EngineName string `form:"engine_names" binding:"Required"` SpecId int64 `form:"spec_id" binding:"Required"` + ModelName string `form:"model_name"` + ModelVersion string `form:"model_version"` + CkptName string `form:"ckpt_name"` + LabelName string `form:"label_names"` + PreTrainModelUrl string `form:"pre_train_model_url"` } type CreateModelArtsInferenceJobForm struct { diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 748af4a29..4e527b6bf 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -24,6 +24,7 @@ const ( CodeMountPath = "/code" DataSetMountPath = "/dataset" ModelMountPath = "/model" + PretrainModelMountPath = "/pretrainmodel" LogFile = "log.txt" BenchMarkMountPath = "/benchmark" BenchMarkResourceID = 1 @@ -77,6 +78,8 @@ type GenerateCloudBrainTaskReq struct { ModelVersion string CkptName string LabelName string + PreTrainModelPath string + PreTrainModelUrl string Spec *models.Specification } @@ -276,6 +279,16 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { }, } + if req.PreTrainModelUrl != "" { //预训练 + volumes = append(volumes, models.Volume{ + HostPath: models.StHostPath{ + Path: req.PreTrainModelPath, + MountPath: PretrainModelMountPath, + ReadOnly: true, + }, + }) + } + if len(req.DatasetInfos) == 1 { volumes = append(volumes, models.Volume{ HostPath: models.StHostPath{ @@ -359,6 +372,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { CkptName: req.CkptName, ResultUrl: req.ResultPath, LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, CreatedUnix: createTime, UpdatedUnix: createTime, CommitID: req.CommitID, diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 687fb4959..83fc3b1d4 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -22,9 +22,6 @@ const ( GpuWorkDir = "/tmp/" NpuWorkDir = "/cache/" - CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;" + - "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" - CodeArchiveName = "master.zip" ) @@ -34,6 +31,9 @@ var ( ImageInfos *setting.StImageInfosModelArts SpecialPools *models.SpecialPools + + CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/%s/archive/master.zip;" + + "echo \"finish loading script\";unzip -q master.zip;cd %s;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" ) type GenerateTrainJobReq struct { @@ -62,8 +62,16 @@ type GenerateTrainJobReq struct { TotalVersionCount int ComputeResource string ProcessType string - DatasetName string + + DatasetNames string + DatasetInfos map[string]models.DatasetInfo Params string + ModelName string + LabelName string + CkptName string + ModelVersion string + PreTrainModelPath string + PreTrainModelUrl string Spec *models.Specification } @@ -72,6 +80,8 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error centerID, centerName := getCentersParamter(ctx, req) + log.Info("grampus Command:" + req.Command) + jobResult, err := createJob(models.CreateGrampusJobRequest{ Name: req.JobName, Tasks: []models.GrampusTasks{ @@ -103,7 +113,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error JobType: string(models.JobTypeTrain), Type: models.TypeC2Net, Uuid: req.Uuid, - DatasetName: req.DatasetName, + DatasetName: req.DatasetNames, CommitID: req.CommitID, IsLatestVersion: req.IsLatestVersion, ComputeResource: req.ComputeResource, @@ -121,6 +131,11 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error CreatedUnix: createTime, UpdatedUnix: createTime, Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, }) if err != nil { diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index d03e56ad8..6b3d1f128 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -104,6 +104,11 @@ type GenerateTrainJobReq struct { UserCommand string DatasetName string Spec *models.Specification + ModelName string + LabelName string + CkptName string + ModelVersion string + PreTrainModelUrl string } type GenerateInferenceJobReq struct { @@ -440,6 +445,11 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error CreatedUnix: createTime, UpdatedUnix: createTime, Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, }) if createErr != nil { @@ -589,6 +599,11 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job CreatedUnix: createTime, UpdatedUnix: createTime, Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, }) if createErr != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error()) diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 97cb5ed62..1a33870b6 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -586,12 +586,13 @@ var ( //grampus config Grampus = struct { - Env string - Host string - UserName string - Password string - SpecialPools string - C2NetSequence string + Env string + Host string + UserName string + Password string + SpecialPools string + C2NetSequence string + SyncScriptProject string }{} C2NetInfos *C2NetSqInfos @@ -1575,6 +1576,8 @@ func getGrampusConfig() { log.Error("Unmarshal(C2NetSequence) failed:%v", err) } } + Grampus.SyncScriptProject = sec.Key("SYNC_SCRIPT_PROJECT").MustString("script_for_grampus") + } func SetRadarMapConfig() { diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 54e24fe97..4004b6efc 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -141,6 +141,35 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode") + if ctx.Cloudbrain != nil { + ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName + ctx.Data["image"] = ctx.Cloudbrain.Image + ctx.Data["image_id"] = ctx.Cloudbrain.ImageID + ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile + ctx.Data["description"] = ctx.Cloudbrain.Description + spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID) + if spec != nil { + ctx.Data["spec_id"] = spec.ID + } + ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters + ctx.Data["model_name"] = ctx.Cloudbrain.ModelName + ctx.Data["label_name"] = ctx.Cloudbrain.LabelName + ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName + ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion + ctx.Data["pre_train_model_url"] = ctx.Cloudbrain.PreTrainModelUrl + ctx.Data["compute_resource"] = ctx.Cloudbrain.ComputeResource + ctx.Data["attachment"] = ctx.Cloudbrain.Uuid + ctx.Data["cluster_type"] = models.OpenICluster + _, _, datasetNames, _, err := getDatasUrlListByUUIDS(ctx.Cloudbrain.Uuid) + if err != nil { + log.Info("query dataset error," + err.Error()) + ctx.Data["dataset_name"] = "" + } else { + ctx.Data["dataset_name"] = datasetNames + } + + } + return nil } @@ -187,8 +216,12 @@ func CloudBrainNew(ctx *context.Context) { ctx.Data["PageIsGPUDebug"] = true ctx.HTML(200, tplCloudBrainNew) } - func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { + ctx.Data["IsCreate"] = true + cloudBrainCreate(ctx, form) +} + +func cloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { ctx.Data["PageIsCloudBrain"] = true displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) @@ -349,6 +382,16 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { Spec: spec, } + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelPath = setting.Attachment.Minio.RealPath + form.PreTrainModelUrl + req.PreTrainModelUrl = form.PreTrainModelUrl + + } + err = cloudbrain.GenerateTask(req) if err != nil { cloudBrainNewDataPrepare(ctx) @@ -362,6 +405,11 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { } } +func CloudBrainTrainJobVersionCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { + ctx.Data["IsCreate"] = false + cloudBrainCreate(ctx, form) +} + func loadCodeAndMakeModelPath(repo *models.Repository, codePath string, branchName string, jobName string, resultPath string) string { err := downloadCode(repo, codePath, branchName) if err != nil { @@ -2626,6 +2674,15 @@ func BenchmarkDel(ctx *context.Context) { } func CloudBrainTrainJobNew(ctx *context.Context) { + ctx.Data["IsCreate"] = true + cloudBrainTrainJobCreate(ctx) +} +func CloudBrainTrainJobVersionNew(ctx *context.Context) { + ctx.Data["IsCreate"] = false + cloudBrainTrainJobCreate(ctx) +} + +func cloudBrainTrainJobCreate(ctx *context.Context) { err := cloudBrainNewDataPrepare(ctx) if err != nil { ctx.ServerError("get new train-job info failed", err) @@ -2715,6 +2772,9 @@ func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { param += " --" + parameter.Label + "=" + parameter.Value } } + if form.CkptName != "" { + param += " --pretrainmodelname" + "=" + form.CkptName + } command += "python /code/" + bootFile + param + " > " + cloudbrain.ModelMountPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 0100e6eb2..03d5f0298 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -13,8 +13,10 @@ import ( "time" "code.gitea.io/gitea/services/cloudbrain/resource" + "code.gitea.io/gitea/services/reward/point/account" + "code.gitea.io/gitea/modules/auth" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/grampus" @@ -45,7 +47,7 @@ const ( ) func GrampusTrainJobGPUNew(ctx *context.Context) { - ctx.Data["datasetType"] = models.TypeCloudBrainOne + ctx.Data["IsCreate"] = true err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) if err != nil { ctx.ServerError("get new train-job info failed", err) @@ -56,7 +58,7 @@ func GrampusTrainJobGPUNew(ctx *context.Context) { } func GrampusTrainJobNPUNew(ctx *context.Context) { - ctx.Data["datasetType"] = models.TypeCloudBrainTwo + ctx.Data["IsCreate"] = true err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) if err != nil { ctx.ServerError("get new train-job info failed", err) @@ -138,9 +140,56 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err ctx.Data["WaitCount"] = waitCount } + if ctx.Cloudbrain != nil { + ctx.Data["attachment"] = ctx.Cloudbrain.Uuid + ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile + ctx.Data["image_id"] = ctx.Cloudbrain.ImageID + ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters + ctx.Data["description"] = ctx.Cloudbrain.Description + ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName + ctx.Data["engine_name"] = ctx.Cloudbrain.EngineName + ctx.Data["WorkServerNumber"] = ctx.Cloudbrain.WorkServerNumber + if ctx.Cloudbrain.Image != "" { + ctx.Data["image"] = ctx.Cloudbrain.Image + } else { + ctx.Data["image"] = ctx.Cloudbrain.EngineName + } + ctx.Data["dataset_name"] = ctx.Cloudbrain.DatasetName + ctx.Data["model_name"] = ctx.Cloudbrain.ModelName + + ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion + ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName + ctx.Data["label_names"] = ctx.Cloudbrain.LabelName + ctx.Data["PreTrainModelUrl"] = ctx.Cloudbrain.PreTrainModelUrl + spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID) + if spec != nil { + ctx.Data["spec_id"] = spec.ID + } + + } return nil } +func GrampusTrainJobVersionNew(ctx *context.Context) { + task := ctx.Cloudbrain + ctx.Data["IsCreate"] = false + if task.ComputeResource == models.GPUResource { + err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) + if err != nil { + ctx.ServerError("get new train-job version info failed", err) + return + } + ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew) + } else if task.ComputeResource == models.NPUResource { + err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) + if err != nil { + ctx.ServerError("get new train-job version info failed", err) + return + } + ctx.HTML(200, tplGrampusTrainJobNPUNew) + } +} + func prepareGrampusTrainSpecs(ctx *context.Context, computeResource string) { noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ JobType: models.JobTypeTrain, @@ -205,6 +254,7 @@ func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error } func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { + ctx.Data["IsCreate"] = true displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) uuid := form.Attachment @@ -214,9 +264,9 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain repo := ctx.Repo.Repository codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/" codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" - dataMinioPath := setting.Attachment.Minio.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid branchName := form.BranchName image := strings.TrimSpace(form.Image) + tpl := tplGrampusTrainJobGPUNew lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName)) isOk, err := lock.Lock(models.CloudbrainKeyDuration) @@ -230,7 +280,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if !jobNamePattern.MatchString(displayJobName) { grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form) return } @@ -238,7 +288,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err != nil || !bootFileExist { log.Error("Get bootfile error:", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form) return } @@ -247,13 +297,13 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err != nil { log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form) return } } @@ -262,7 +312,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err := grampusParamCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(err.Error(), tpl, &form) return } @@ -272,14 +322,14 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if len(tasks) != 0 { log.Error("the job name did already exist", ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr("the job name did already exist", tpl, &form) return } } else { if !models.IsErrJobNotExist(err) { log.Error("system error, %v", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } } @@ -292,7 +342,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain }) if err != nil || spec == nil { grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr("Resource specification not available", tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr("Resource specification not available", tpl, &form) return } @@ -304,11 +354,12 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } //check dataset - attachment, err := models.GetAttachmentByUUID(uuid) + + datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU) if err != nil { - log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) + log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) return } @@ -321,7 +372,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil { log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) return } @@ -330,7 +381,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil { log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) return } @@ -338,7 +389,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err := mkModelPath(modelPath); err != nil { log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) return } @@ -346,52 +397,102 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil { log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) return } + var datasetRemotePath, allFileName string + for _, datasetInfo := range datasetInfos { + if datasetRemotePath == "" { + datasetRemotePath = datasetInfo.DataLocalPath + allFileName = datasetInfo.FullName + } else { + datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + allFileName = allFileName + ";" + datasetInfo.FullName + } + + } + //prepare command - command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", dataMinioPath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", attachment.Name) + preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName) + + command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName) if err != nil { log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr("Create task failed, internal error", tpl, &form) return } commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) req := &grampus.GenerateTrainJobReq{ - JobName: jobName, - DisplayJobName: displayJobName, - ComputeResource: models.GPUResource, - ProcessType: grampus.ProcessorTypeGPU, - Command: command, - ImageUrl: image, - Description: description, - BootFile: bootFile, - Uuid: uuid, - CommitID: commitID, - BranchName: branchName, - Params: form.Params, - EngineName: image, - DatasetName: attachment.Name, + JobName: jobName, + DisplayJobName: displayJobName, + ComputeResource: models.GPUResource, + ProcessType: grampus.ProcessorTypeGPU, + Command: command, + ImageUrl: image, + Description: description, + BootFile: bootFile, + Uuid: uuid, + CommitID: commitID, + BranchName: branchName, + Params: form.Params, + EngineName: image, + DatasetNames: datasetNames, + DatasetInfos: datasetInfos, + IsLatestVersion: modelarts.IsLatestVersion, VersionCount: modelarts.VersionCountOne, WorkServerNumber: 1, Spec: spec, } + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelUrl = form.PreTrainModelUrl + + } + err = grampus.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) - ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form) + ctx.RenderWithErr(err.Error(), tpl, &form) return } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func getPreTrainModelPath(pretrainModelDir string, fileName string) string { + index := strings.Index(pretrainModelDir, "/") + if index > 0 { + filterBucket := pretrainModelDir[index+1:] + return filterBucket + fileName + } else { + return "" + } + +} + +func GrampusTrainJobVersionCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { + ctx.Data["IsCreate"] = false + computeResource := ctx.Query("compute_resource") + if computeResource == models.GPUResource { + GrampusTrainJobGpuCreate(ctx, form) + } else if computeResource == models.NPUResource { + GrampusTrainJobNpuCreate(ctx, form) + } else { + ctx.ServerError("resource error", errors.New("compute resource is not support")) + return + } + +} + func checkSpecialPool(ctx *context.Context, resourceType string) string { grampus.InitSpecialPool() if grampus.SpecialPools != nil { @@ -415,6 +516,7 @@ func checkSpecialPool(ctx *context.Context, resourceType string) string { } func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { + ctx.Data["IsCreate"] = true displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) uuid := form.Attachment @@ -424,11 +526,12 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain repo := ctx.Repo.Repository codeLocalPath := setting.JobPath + jobName + modelarts.CodePath codeObsPath := grampus.JobPath + jobName + modelarts.CodePath - dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" + //dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" branchName := form.BranchName isLatestVersion := modelarts.IsLatestVersion versionCount := modelarts.VersionCountOne engineName := form.EngineName + tpl := tplGrampusTrainJobNPUNew lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName)) isOk, err := lock.Lock(models.CloudbrainKeyDuration) @@ -442,7 +545,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if !jobNamePattern.MatchString(displayJobName) { grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form) return } @@ -450,7 +553,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err != nil || !bootFileExist { log.Error("Get bootfile error:", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form) return } @@ -459,13 +562,13 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err != nil { log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form) return } } @@ -474,7 +577,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err := grampusParamCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(err.Error(), tpl, &form) return } @@ -484,14 +587,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if len(tasks) != 0 { log.Error("the job name did already exist", ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr("the job name did already exist", tpl, &form) return } } else { if !models.IsErrJobNotExist(err) { log.Error("system error, %v", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } } @@ -504,7 +607,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain }) if err != nil || spec == nil { grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr("Resource specification not available", tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr("Resource specification not available", tpl, &form) return } if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) { @@ -515,11 +618,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } //check dataset - attachment, err := models.GetAttachmentByUUID(uuid) + datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU) if err != nil { - log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) + log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) return } @@ -532,7 +635,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil { log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) return } @@ -540,23 +643,36 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) return } if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) return } + var datasetRemotePath, allFileName string + for _, datasetInfo := range datasetInfos { + if datasetRemotePath == "" { + datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'" + allFileName = datasetInfo.FullName + } else { + datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'" + allFileName = allFileName + ";" + datasetInfo.FullName + } + + } + //prepare command - command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+"'"+attachment.Name+"'", bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name) + preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName) + command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName) if err != nil { log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr("Create task failed, internal error", tpl, &form) return } @@ -569,7 +685,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ProcessType: grampus.ProcessorTypeNPU, Command: command, ImageId: form.ImageID, - DataUrl: dataObsPath, Description: description, CodeObsPath: codeObsPath, BootFileUrl: codeObsPath + bootFile, @@ -583,15 +698,24 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain EngineName: engineName, VersionCount: versionCount, TotalVersionCount: modelarts.TotalVersionCount, - DatasetName: attachment.Name, + DatasetNames: datasetNames, + DatasetInfos: datasetInfos, Spec: spec, } + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelUrl = form.PreTrainModelUrl + + } err = grampus.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) - ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) + ctx.RenderWithErr(err.Error(), tpl, &form) return } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") @@ -802,7 +926,7 @@ func GrampusGetLog(ctx *context.Context) { return } -func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName string) (string, error) { +func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName string) (string, error) { var command string workDir := grampus.NpuWorkDir @@ -810,22 +934,22 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo workDir = grampus.GpuWorkDir } - command += "pwd;cd " + workDir + grampus.CommandPrepareScript + command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject) //download code & dataset if processorType == grampus.ProcessorTypeNPU { - commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';" + commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'" + commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload) command += commandDownload } else if processorType == grampus.ProcessorTypeGPU { - commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';" + commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'" + commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload) command += commandDownload } //unzip code & dataset - toolUnzip := "unzip -q '" - if strings.HasSuffix(datasetName, ".tar.gz") { - toolUnzip = "tar -zxvf '" - } - commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + toolUnzip + datasetName + "';" + unZipDatasetCommand := generateDatasetUnzipCommand(datasetName) + + commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand command += commandUnzip command += "echo \"unzip finished;start to exec code;\";" @@ -856,6 +980,10 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo } } + if pretrainModelFileName != "" { + paramCode += " --pretrainmodelname" + "=" + pretrainModelFileName + } + var commandCode string if processorType == grampus.ProcessorTypeNPU { commandCode = "/bin/bash /home/work/run_train_for_openi.sh " + workDir + "code/" + strings.ToLower(repoName) + "/" + bootFile + " /tmp/log/train.log" + paramCode + ";" @@ -871,10 +999,10 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo //upload models if processorType == grampus.ProcessorTypeNPU { - commandUpload := "cd " + workDir + "script_for_grampus/;./uploader_for_npu " + setting.Bucket + " " + outputRemotePath + " " + workDir + "output/;" + commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_npu " + setting.Bucket + " " + outputRemotePath + " " + workDir + "output/;" command += commandUpload } else if processorType == grampus.ProcessorTypeGPU { - commandUpload := "cd " + workDir + "script_for_grampus/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;" + commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;" command += commandUpload } @@ -885,6 +1013,38 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo return command, nil } +func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string { + commandDownloadTemp := commandDownload + if pretrainModelPath != "" { + commandDownloadTemp += " '" + pretrainModelPath + "' '" + pretrainModelFileName + "'" + } + commandDownloadTemp += ";" + return commandDownloadTemp +} + +func generateDatasetUnzipCommand(datasetName string) string { + var unZipDatasetCommand string + + datasetNameArray := strings.Split(datasetName, ";") + if len(datasetNameArray) == 1 { //单数据集 + unZipDatasetCommand = "unzip -q '" + datasetName + "';" + if strings.HasSuffix(datasetName, ".tar.gz") { + unZipDatasetCommand = "tar --strip-components=1 -zxvf '" + datasetName + "';" + } + + } else { //多数据集 + for _, datasetNameTemp := range datasetNameArray { + if strings.HasSuffix(datasetName, ".tar.gz") { + unZipDatasetCommand = unZipDatasetCommand + "tar -zxvf '" + datasetName + "';" + } else { + unZipDatasetCommand = unZipDatasetCommand + "unzip -q '" + datasetNameTemp + "' -d './" + strings.TrimSuffix(datasetNameTemp, ".zip") + "';" + } + } + + } + return unZipDatasetCommand +} + func downloadZipCode(ctx *context.Context, codePath, branchName string) error { archiveType := git.ZIP archivePath := codePath diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 4f3341e6f..1f69d4669 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -859,84 +859,6 @@ func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) { } } -func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error { - ctx.Data["PageIsCloudBrain"] = true - - //can, err := canUserCreateTrainJob(ctx.User.ID) - //if err != nil { - // ctx.ServerError("canUserCreateTrainJob", err) - // return - //} - // - //if !can { - // log.Error("the user can not create train-job") - // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job")) - // return - //} - - t := time.Now() - var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] - ctx.Data["display_job_name"] = displayJobName - - attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID) - if err != nil { - ctx.ServerError("GetAllUserAttachments failed:", err) - return err - } - ctx.Data["attachments"] = attachs - - var resourcePools modelarts.ResourcePool - if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["resource_pools"] = resourcePools.Info - - var engines modelarts.Engine - if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["engines"] = engines.Info - - var versionInfos modelarts.VersionInfo - if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["engine_versions"] = versionInfos.Version - - prepareCloudbrainTwoTrainSpecs(ctx) - - configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) - if err != nil { - ctx.ServerError("getConfigList failed:", err) - return err - } - var Parameters modelarts.Parameters - if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["params"] = Parameters.Parameter - ctx.Data["config_list"] = configList.ParaConfigs - ctx.Data["bootFile"] = form.BootFile - ctx.Data["uuid"] = form.Attachment - _, datasetNames, err := models.GetDatasetInfo(form.Attachment) - if err != nil { - log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) - return nil - } - ctx.Data["dataset_name"] = datasetNames - ctx.Data["branch_name"] = form.BranchName - ctx.Data["datasetType"] = models.TypeCloudBrainTwo - waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") - ctx.Data["WaitCount"] = waitCount - setMultiNodeIfConfigureMatch(ctx) - - return nil -} - func TrainJobNewVersion(ctx *context.Context) { err := trainJobNewVersionDataPrepare(ctx) @@ -1002,12 +924,7 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error { ctx.Data["spec_id"] = spec.ID } - var Parameters modelarts.Parameters - if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["params"] = Parameters.Parameter + ctx.Data["run_para_list"] = task.Parameters branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) if err != nil { @@ -1030,104 +947,24 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error { ctx.Data["work_server_number"] = task.WorkServerNumber ctx.Data["flavor_name"] = task.FlavorName ctx.Data["engine_name"] = task.EngineName - ctx.Data["uuid"] = task.Uuid + ctx.Data["attachment"] = task.Uuid ctx.Data["flavor_code"] = task.FlavorCode ctx.Data["engine_id"] = task.EngineID ctx.Data["datasetType"] = models.TypeCloudBrainTwo - configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) - if err != nil { - ctx.ServerError("getConfigList failed:", err) - return err - } - ctx.Data["config_list"] = configList.ParaConfigs - waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") - ctx.Data["WaitCount"] = waitCount - - return nil -} - -func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error { - ctx.Data["PageIsCloudBrain"] = true - var jobID = ctx.Params(":jobid") - // var versionName = ctx.Params(":version-name") - var versionName = ctx.Query("version_name") - - task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) - if err != nil { - log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) - return err - } - - t := time.Now() - var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] - ctx.Data["job_name"] = task.JobName - - attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID) - if err != nil { - ctx.ServerError("GetAllUserAttachments failed:", err) - return err - } - ctx.Data["attachments"] = attachs - - var resourcePools modelarts.ResourcePool - if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["resource_pools"] = resourcePools.Info - - var engines modelarts.Engine - if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["engines"] = engines.Info - - var versionInfos modelarts.VersionInfo - if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["engine_versions"] = versionInfos.Version - - prepareCloudbrainTwoTrainSpecs(ctx) + //pretrain model + ctx.Data["model_name"] = task.ModelName + ctx.Data["model_version"] = task.ModelVersion + ctx.Data["ckpt_name"] = task.CkptName + ctx.Data["label_names"] = task.LabelName + ctx.Data["pre_train_model_url"] = task.PreTrainModelUrl - var Parameters modelarts.Parameters - if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["params"] = Parameters.Parameter - - outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath - ctx.Data["train_url"] = outputObsPath - - branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) - if err != nil { - ctx.ServerError("GetBranches error:", err) - return err - } - ctx.Data["branches"] = branches - ctx.Data["description"] = form.Description - ctx.Data["dataset_name"] = task.DatasetName - ctx.Data["work_server_number"] = form.WorkServerNumber - ctx.Data["flavor_name"] = form.FlavorName - ctx.Data["engine_name"] = form.EngineName - ctx.Data["flavor_code"] = task.FlavorCode - ctx.Data["engine_id"] = task.EngineID - ctx.Data["version_name"] = form.VersionName - - ctx.Data["bootFile"] = form.BootFile - ctx.Data["uuid"] = form.Attachment - ctx.Data["branch_name"] = form.BranchName configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { ctx.ServerError("getConfigList failed:", err) return err } ctx.Data["config_list"] = configList.ParaConfigs - ctx.Data["datasetType"] = models.TypeCloudBrainTwo waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount @@ -1161,7 +998,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) errStr := checkMultiNode(ctx.User.ID, form.WorkServerNumber) if errStr != "" { - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) return } @@ -1170,7 +1007,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) isOk, err := lock.Lock(models.CloudbrainKeyDuration) if !isOk { log.Error("lock processed failed:%v", err, ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsTrainJobNew, &form) return } @@ -1179,13 +1016,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form) return } @@ -1193,7 +1030,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } @@ -1201,7 +1038,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName) if err != nil || !bootFileExist { log.Error("Get bootfile error:", err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsTrainJobNew, &form) return } @@ -1212,7 +1049,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) Cluster: models.OpenICluster, AiCenterCode: models.AICenterOfCloudBrainTwo}) if err != nil || spec == nil { - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form) return } @@ -1228,14 +1065,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) if err == nil { if len(tasks) != 0 { log.Error("the job name did already exist", ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form) return } } else { if !models.IsErrJobNotExist(err) { log.Error("system error, %v", err, ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) return } @@ -1252,7 +1089,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) if err := downloadCode(repo, codeLocalPath, branchName); err != nil { log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobNew, &form) return } @@ -1260,14 +1097,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) //todo: upload code (send to file_server todo this work?) if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) return } if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) return } @@ -1276,7 +1113,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobNew, &form) return } @@ -1288,7 +1125,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) err := json.Unmarshal([]byte(params), ¶meters) if err != nil { log.Error("Failed to Unmarshal params: %s (%v)", params, err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) return } @@ -1314,7 +1151,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid) if err != nil { log.Error("Failed to getDatasUrlListByUUIDS: %v", err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("Failed to getDatasUrlListByUUIDS:"+err.Error(), tplModelArtsTrainJobNew, &form) return } @@ -1322,7 +1159,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) jsondatas, err := json.Marshal(datasUrlList) if err != nil { log.Error("Failed to Marshal: %v", err) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsTrainJobNew, &form) return } @@ -1332,6 +1169,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) Value: string(jsondatas), }) } + if form.ModelName != "" { //使用预训练模型训练 + ckptUrl := "/" + form.PreTrainModelUrl + form.CkptName + param = append(param, models.Parameter{ + Label: modelarts.CkptUrl, + Value: "s3:/" + ckptUrl, + }) + } //save param config // if isSaveParam == "on" { @@ -1400,6 +1244,15 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) DatasetName: datasetNames, Spec: spec, } + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelUrl = form.PreTrainModelUrl + + } + userCommand, userImageUrl := getUserCommand(engineID, req) req.UserCommand = userCommand req.UserImageUrl = userImageUrl @@ -1414,7 +1267,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) err = modelarts.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) - trainJobErrorNewDataPrepare(ctx, form) + trainJobNewDataPrepare(ctx) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } @@ -1499,7 +1352,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ errStr := checkMultiNode(ctx.User.ID, form.WorkServerNumber) if errStr != "" { - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) return } @@ -1507,13 +1360,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form) return } @@ -1552,7 +1405,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ isOk, err := lock.Lock(models.CloudbrainKeyDuration) if !isOk { log.Error("lock processed failed:%v", err, ctx.Data["MsgID"]) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsTrainJobVersionNew, &form) return } @@ -1560,14 +1413,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ canNewJob, _ := canUserCreateTrainJobVersion(ctx, latestTask.UserID) if !canNewJob { - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("user cann't new trainjob", tplModelArtsTrainJobVersionNew, &form) return } if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) return } @@ -1575,7 +1428,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName) if err != nil || !bootFileExist { log.Error("Get bootfile error:", err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsTrainJobVersionNew, &form) return } @@ -1586,13 +1439,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ Cluster: models.OpenICluster, AiCenterCode: models.AICenterOfCloudBrainTwo}) if err != nil || spec == nil { - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form) return } if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) { log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsTrainJobVersionNew, &form) return } @@ -1607,7 +1460,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ commitID, _ := gitRepo.GetBranchCommitID(branchName) if err := downloadCode(repo, codeLocalPath, branchName); err != nil { log.Error("Failed git clone repo to local(!: %s (%v)", repo.FullName(), err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobVersionNew, &form) return } @@ -1615,14 +1468,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ //todo: upload code (send to file_server todo this work?) if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form) return } if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form) return } @@ -1632,7 +1485,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ // if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobVersionNew, &form) return } @@ -1646,7 +1499,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ err := json.Unmarshal([]byte(params), ¶meters) if err != nil { log.Error("Failed to Unmarshal params: %s (%v)", params, err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form) return } @@ -1672,7 +1525,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid) if err != nil { log.Error("Failed to getDatasUrlListByUUIDS: %v", err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("Failed to getDatasUrlListByUUIDS:"+err.Error(), tplModelArtsTrainJobVersionNew, &form) return } @@ -1680,7 +1533,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ jsondatas, err := json.Marshal(datasUrlList) if err != nil { log.Error("Failed to Marshal: %v", err) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsTrainJobVersionNew, &form) return } @@ -1691,46 +1544,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ }) } - // //save param config - // if isSaveParam == "on" { - // saveparams := append(param, models.Parameter{ - // Label: modelarts.TrainUrl, - // Value: outputObsPath, - // }, models.Parameter{ - // Label: modelarts.DataUrl, - // Value: dataPath, - // }) - // if form.ParameterTemplateName == "" { - // log.Error("ParameterTemplateName is empty") - // versionErrorDataPrepare(ctx, form) - // ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form) - // return - // } - - // _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ - // ConfigName: form.ParameterTemplateName, - // Description: form.PrameterDescription, - // DataUrl: dataPath, - // AppUrl: codeObsPath, - // BootFileUrl: codeObsPath + bootFile, - // TrainUrl: outputObsPath, - // Flavor: models.Flavor{ - // Code: flavorCode, - // }, - // WorkServerNum: workServerNumber, - // EngineID: int64(engineID), - // LogUrl: logObsPath, - // PoolID: poolID, - // Parameter: saveparams, - // }) + if form.ModelName != "" { //使用预训练模型训练 + ckptUrl := "/" + form.PreTrainModelUrl + form.CkptName + param = append(param, models.Parameter{ + Label: modelarts.CkptUrl, + Value: "s3:/" + ckptUrl, + }) + } - // if err != nil { - // log.Error("Failed to CreateTrainJobConfig: %v", err) - // versionErrorDataPrepare(ctx, form) - // ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form) - // return - // } - // } task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName) if err != nil { @@ -1765,6 +1586,15 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ DatasetName: datasetNames, Spec: spec, } + + if form.ModelName != "" { //使用预训练模型训练 + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelUrl = form.PreTrainModelUrl + + } userCommand, userImageUrl := getUserCommand(engineID, req) req.UserCommand = userCommand req.UserImageUrl = userImageUrl @@ -1772,7 +1602,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ err = modelarts.GenerateTrainJobVersion(ctx, req, jobID) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) - versionErrorDataPrepare(ctx, form) + trainJobNewVersionDataPrepare(ctx) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) return } diff --git a/routers/routes/routes.go b/routers/routes/routes.go index afb7f3187..6d31cfe46 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1156,6 +1156,8 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) //m.Get("/get_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo.GetLogFromModelDir) //m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) + m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.CloudBrainTrainJobVersionNew) + m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainTrainJobVersionCreate) }) m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, context.PointAccount(), repo.CloudBrainTrainJobNew) m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainCreate) @@ -1178,6 +1180,8 @@ func RegisterRoutes(m *macaron.Macaron) { m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusTrainJobDel) m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload) + m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.GrampusTrainJobVersionNew) + m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateGrampusTrainJobForm{}), repo.GrampusTrainJobVersionCreate) }) m.Group("/gpu", func() { m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, context.PointAccount(), repo.GrampusTrainJobGPUNew) diff --git a/templates/custom/select_model.tmpl b/templates/custom/select_model.tmpl new file mode 100644 index 000000000..81332b873 --- /dev/null +++ b/templates/custom/select_model.tmpl @@ -0,0 +1,37 @@ + +
+   +
+ +
+
+ +
+
+ + +
+ + + + +
\ No newline at end of file diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl index 695726613..b07832752 100755 --- a/templates/repo/cloudbrain/trainjob/new.tmpl +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -70,7 +70,7 @@
{{template "repo/header" .}}
- + {{template "base/alert" .}}

{{.i18n.Tr "repo.modelarts.train_job.new"}} @@ -125,15 +125,14 @@ {{.i18n.Tr "cloudbrain.new_train_gpu_tooltips" "/code" "/dataset" "/model" | Safe}}

-
+
- {{.i18n.Tr "repo.cloudbrain_jobname_err"}}
- + {{.i18n.Tr "repo.cloudbrain_jobname_err"}}
@@ -168,7 +167,7 @@ {{end}}
- + {{template "custom/select_model" .}} -
@@ -228,33 +205,10 @@ style="margin-left: 0.5rem;cursor:pointer;color: rgba(3, 102, 214, 100);font-size: 14px;line-height: 26px;font-family: SourceHanSansSC-medium;">{{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}} -
+
-
@@ -290,166 +244,8 @@
{{template "base/footer" .}} + - + - + + - +