From d83229936fece00f3f1610ced51e9c49e21dd252 Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 29 Sep 2022 11:07:58 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9B=E5=BB=BANPU=E7=9A=84=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E5=AE=89=E5=85=A8=E6=A3=80=E6=B5=8B=E4=BB=BB=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- modules/modelarts/modelarts.go | 22 ++-- routers/repo/aisafety.go | 190 +++++++++++++++++++++++++++++++++ routers/repo/modelarts.go | 1 + 3 files changed, 203 insertions(+), 10 deletions(-) diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 6b3d1f128..ca1195720 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -1,7 +1,6 @@ package modelarts import ( - "code.gitea.io/gitea/modules/modelarts_cd" "encoding/json" "errors" "fmt" @@ -9,6 +8,8 @@ import ( "strconv" "strings" + "code.gitea.io/gitea/modules/modelarts_cd" + "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/log" @@ -70,8 +71,8 @@ const ( var ( poolInfos *models.PoolInfos TrainFlavorInfos *Flavor - SpecialPools *models.SpecialPools - MultiNodeConfig *MultiNodes + SpecialPools *models.SpecialPools + MultiNodeConfig *MultiNodes ) type GenerateTrainJobReq struct { @@ -141,6 +142,7 @@ type GenerateInferenceJobReq struct { ResultUrl string Spec *models.Specification DatasetName string + JobType string } type VersionInfo struct { @@ -173,12 +175,12 @@ type ResourcePool struct { } `json:"resource_pool"` } -type MultiNodes struct{ +type MultiNodes struct { Info []OrgMultiNode `json:"multinode"` } -type OrgMultiNode struct{ +type OrgMultiNode struct { Org string `json:"org"` - Node []int `json:"node"` + Node []int `json:"node"` } // type Parameter struct { @@ -709,7 +711,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e Status: models.TempJobStatus, Type: models.TypeCloudBrainTwo, JobName: 
req.JobName, - JobType: string(models.JobTypeInference), + JobType: req.JobType, }) if err != nil { log.Error("InsertCloudbrainTemp failed: %v", err.Error()) @@ -732,7 +734,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e JobID: jobID, JobName: req.JobName, DisplayJobName: req.DisplayJobName, - JobType: string(models.JobTypeInference), + JobType: req.JobType, Type: models.TypeCloudBrainTwo, VersionID: jobResult.VersionID, VersionName: jobResult.VersionName, @@ -798,8 +800,8 @@ func InitSpecialPool() { } } -func InitMultiNode(){ - if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{ +func InitMultiNode() { + if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" { json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig) } diff --git a/routers/repo/aisafety.go b/routers/repo/aisafety.go index 4b53f737d..0f4b6ee1a 100644 --- a/routers/repo/aisafety.go +++ b/routers/repo/aisafety.go @@ -16,11 +16,14 @@ import ( "code.gitea.io/gitea/modules/aisafety" "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/modelarts" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/storage" "code.gitea.io/gitea/modules/util" "code.gitea.io/gitea/services/cloudbrain/resource" + "code.gitea.io/gitea/services/reward/point/account" uuid "github.com/satori/go.uuid" ) @@ -315,7 +318,194 @@ func AiSafetyCreateForPost(ctx *context.Context) { } func createForNPU(ctx *context.Context, jobName string) { + VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) + BootFile := ctx.Query("BootFile") + displayJobName := ctx.Query("DisplayJobName") + description := ctx.Query("Description") + engineID := ctx.QueryInt("EngineID") + poolID := ctx.Query("PoolID") + //image := strings.TrimSpace(ctx.Query("Image")) + srcDataset := ctx.Query("srcDataset") //uuid + combatDataset := 
ctx.Query("combatDataset") //uuid + evaluationIndex := ctx.Query("evaluationIndex") + Params := ctx.Query("RunParaList") + specId := ctx.QueryInt64("SpecId") + + repo := ctx.Repo.Repository + + trainUrl := ctx.Query("TrainUrl") + modelName := ctx.Query("ModelName") + modelVersion := ctx.Query("ModelVersion") + ckptName := ctx.Query("CkptName") + ckptUrl := "/" + trainUrl + ckptName + log.Info("ckpt url:" + ckptUrl) + + FlavorName := ctx.Query("FlavorName") + EngineName := ctx.Query("EngineName") + + isLatestVersion := modelarts.IsLatestVersion + VersionCount := modelarts.VersionCountOne + + codeLocalPath := setting.JobPath + jobName + modelarts.CodePath + codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/" + logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/" + log.Info("ckpt url:" + ckptUrl) + spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo}) + if err != nil || spec == nil { + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNew, nil) + return + } + if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) { + log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID) + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNew, nil) + return + } + + //todo: del the codeLocalPath + _, err = ioutil.ReadDir(codeLocalPath) + if err == nil { + os.RemoveAll(codeLocalPath) + } + + gitRepo, _ := git.OpenRepository(repo.RepoPath()) + commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName) + + if err := 
downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil { + log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err) + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNew, nil) + return + } + + //todo: upload code (send to file_server todo this work?) + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil { + log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err) + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNew, nil) + return + } + + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { + log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNew, nil) + return + } + + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { + log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNew, nil) + return + } + + var parameters models.Parameters + param := make([]models.Parameter, 0) + param = append(param, models.Parameter{ + Label: modelarts.ResultUrl, + Value: "s3:/" + resultObsPath, + }, models.Parameter{ + Label: modelarts.CkptUrl, + Value: "s3:/" + ckptUrl, + }) + uuid := srcDataset + ";" + combatDataset + datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid) + if err != nil { + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNew, nil) + return + } + dataPath := dataUrl + jsondatas, err := json.Marshal(datasUrlList) + if err != nil { + log.Error("Failed to Marshal: %v", err) + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr("json 
error:"+err.Error(), tplCloudBrainModelSafetyNew, nil) + return + } + if isMultiDataset { + param = append(param, models.Parameter{ + Label: modelarts.MultiDataUrl, + Value: string(jsondatas), + }) + } + + existDeviceTarget := false + if len(Params) != 0 { + err := json.Unmarshal([]byte(Params), &parameters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", Params, err) + modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNew, nil) + return + } + for _, parameter := range parameters.Parameter { + if parameter.Label == modelarts.DeviceTarget { + existDeviceTarget = true + } + if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) + } + } + } + if !existDeviceTarget { + param = append(param, models.Parameter{ + Label: modelarts.DeviceTarget, + Value: modelarts.Ascend, + }) + } + + req := &modelarts.GenerateInferenceJobReq{ + JobName: jobName, + DisplayJobName: displayJobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFileUrl: codeObsPath + BootFile, + BootFile: BootFile, + TrainUrl: trainUrl, + WorkServerNumber: 1, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Uuid: uuid, + Parameters: param, //modelarts train parameters + CommitID: commitID, + BranchName: cloudbrain.DefaultBranchName, + Params: Params, + FlavorName: FlavorName, + EngineName: EngineName, + LabelName: evaluationIndex, + IsLatestVersion: isLatestVersion, + VersionCount: VersionCount, + TotalVersionCount: modelarts.TotalVersionCount, + ModelName: modelName, + ModelVersion: modelVersion, + CkptName: ckptName, + ResultUrl: resultObsPath, + Spec: spec, + DatasetName: datasetNames, + JobType: string(models.JobTypeModelSafety), + } + + err = modelarts.GenerateInferenceJob(ctx, req) + if err != nil { + log.Error("GenerateTrainJob failed:%v", err.Error()) +
modelSafetyNewDataPrepare(ctx) + ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNew, nil) + return + } } func createForGPU(ctx *context.Context, jobName string) { diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 5487231a2..811ec0424 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -2200,6 +2200,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference ResultUrl: resultObsPath, Spec: spec, DatasetName: datasetNames, + JobType: string(models.JobTypeInference), } err = modelarts.GenerateInferenceJob(ctx, req)