|
|
|
@@ -18,7 +18,6 @@ import ( |
|
|
|
"code.gitea.io/gitea/modules/cloudbrain" |
|
|
|
"code.gitea.io/gitea/modules/context" |
|
|
|
"code.gitea.io/gitea/modules/git" |
|
|
|
"code.gitea.io/gitea/modules/grampus" |
|
|
|
"code.gitea.io/gitea/modules/log" |
|
|
|
"code.gitea.io/gitea/modules/modelarts" |
|
|
|
"code.gitea.io/gitea/modules/setting" |
|
|
|
@@ -232,8 +231,6 @@ func syncAiSafetyTaskStatus(job *models.Cloudbrain) { |
|
|
|
queryTaskStatusFromCloudbrainTwo(job) |
|
|
|
} else if job.Type == models.TypeCloudBrainOne { |
|
|
|
queryTaskStatusFromCloudbrain(job) |
|
|
|
} else if job.Type == models.TypeC2Net { |
|
|
|
queryTaskStatusFromGrampus(job) |
|
|
|
} |
|
|
|
} else { |
|
|
|
if job.Status == string(models.ModelSafetyTesting) { |
|
|
|
@@ -260,55 +257,6 @@ func TimerHandleModelSafetyTestTask() { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func queryTaskStatusFromGrampus(task *models.Cloudbrain) { |
|
|
|
log.Info("The task not finished,name=" + task.DisplayJobName) |
|
|
|
if task.DeletedAt.IsZero() { //normal record |
|
|
|
result, err := grampus.GetJob(task.JobID) |
|
|
|
resultJson, _ := json.Marshal(result) |
|
|
|
log.Info("resultJson=" + string(resultJson)) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetJob failed:" + err.Error()) |
|
|
|
return |
|
|
|
} |
|
|
|
if result != nil { |
|
|
|
if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { |
|
|
|
task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] |
|
|
|
} |
|
|
|
task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) |
|
|
|
if task.Status != models.GrampusStatusSucceeded { |
|
|
|
if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning { |
|
|
|
task.Duration = result.JobInfo.RunSec |
|
|
|
if task.Duration < 0 { |
|
|
|
task.Duration = 0 |
|
|
|
} |
|
|
|
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) |
|
|
|
|
|
|
|
if task.StartTime == 0 && result.JobInfo.StartedAt > 0 { |
|
|
|
task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) |
|
|
|
} |
|
|
|
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { |
|
|
|
task.EndTime = task.StartTime.Add(task.Duration) |
|
|
|
} |
|
|
|
task.CorrectCreateUnix() |
|
|
|
err = models.UpdateJob(task) |
|
|
|
if err != nil { |
|
|
|
log.Error("UpdateJob failed:" + err.Error()) |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
task.Status = string(models.ModelSafetyTesting) |
|
|
|
err = models.UpdateJob(task) |
|
|
|
if err != nil { |
|
|
|
log.Error("UpdateJob failed:", err) |
|
|
|
} |
|
|
|
//send msg to beihang |
|
|
|
sendGPUInferenceResultToTest(task) |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) { |
|
|
|
log.Info("The task not finished,name=" + job.DisplayJobName) |
|
|
|
result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10)) |
|
|
|
@@ -574,10 +522,10 @@ func AiSafetyCreateForGetGPU(ctx *context.Context) { |
|
|
|
ctx.Data["type"] = models.TypeCloudBrainOne |
|
|
|
ctx.Data["compute_resource"] = models.GPUResource |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainOne |
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID |
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID |
|
|
|
var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] |
|
|
|
ctx.Data["display_job_name"] = displayJobName |
|
|
|
prepareCloudbrainOneSpecs(ctx) |
|
|
|
@@ -587,43 +535,6 @@ func AiSafetyCreateForGetGPU(ctx *context.Context) { |
|
|
|
} |
|
|
|
ctx.HTML(200, tplModelSafetyTestCreateGpu) |
|
|
|
} |
|
|
|
func AiSafetyCreateForGetGrampusGPU(ctx *context.Context) { |
|
|
|
ctx.Data["PageIsCloudBrain"] = true |
|
|
|
ctx.Data["IsCreate"] = true |
|
|
|
ctx.Data["type"] = models.TypeC2Net |
|
|
|
ctx.Data["compute_resource"] = models.GPUResource |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainOne |
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID |
|
|
|
err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
if err != nil { |
|
|
|
ctx.ServerError("get new train-job info failed", err) |
|
|
|
return |
|
|
|
} |
|
|
|
ctx.HTML(200, tplModelSafetyTestCreateGrampusGpu) |
|
|
|
} |
|
|
|
|
|
|
|
func AiSafetyCreateForGetGrampusNPU(ctx *context.Context) { |
|
|
|
ctx.Data["PageIsCloudBrain"] = true |
|
|
|
ctx.Data["IsCreate"] = true |
|
|
|
ctx.Data["type"] = models.TypeC2Net |
|
|
|
ctx.Data["compute_resource"] = models.NPUResource |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainTwo |
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID |
|
|
|
|
|
|
|
err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
if err != nil { |
|
|
|
ctx.ServerError("get new train-job info failed", err) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
ctx.HTML(200, tplModelSafetyTestCreateGrampusNpu) |
|
|
|
} |
|
|
|
|
|
|
|
func AiSafetyCreateForGetNPU(ctx *context.Context) { |
|
|
|
t := time.Now() |
|
|
|
@@ -634,10 +545,10 @@ func AiSafetyCreateForGetNPU(ctx *context.Context) { |
|
|
|
var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] |
|
|
|
ctx.Data["display_job_name"] = displayJobName |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainTwo |
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID |
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID |
|
|
|
|
|
|
|
var resourcePools modelarts.ResourcePool |
|
|
|
if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { |
|
|
|
@@ -733,167 +644,11 @@ func AiSafetyCreateForPost(ctx *context.Context) { |
|
|
|
} else if taskType == models.TypeCloudBrainOne { |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainOne |
|
|
|
createForGPU(ctx, jobName) |
|
|
|
} else if taskType == models.TypeC2Net { |
|
|
|
ComputeResource := ctx.Query("compute_resource") |
|
|
|
if ComputeResource == models.NPUResource { |
|
|
|
createForGrampusNPU(ctx, jobName) |
|
|
|
} else if ComputeResource == models.GPUResource { |
|
|
|
createForGrampusGPU(ctx, jobName) |
|
|
|
} |
|
|
|
} |
|
|
|
log.Info("to redirect...") |
|
|
|
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark") |
|
|
|
} |
|
|
|
|
|
|
|
func createForGrampusGPU(ctx *context.Context, jobName string) { |
|
|
|
BootFile := ctx.Query("boot_file") |
|
|
|
displayJobName := ctx.Query("display_job_name") |
|
|
|
description := ctx.Query("description") |
|
|
|
image := strings.TrimSpace(ctx.Query("image")) |
|
|
|
srcDataset := ctx.Query("src_dataset") //uuid |
|
|
|
combatDataset := ctx.Query("combat_dataset") //uuid |
|
|
|
evaluationIndex := ctx.Query("evaluation_index") |
|
|
|
Params := ctx.Query("run_para_list") |
|
|
|
specId := ctx.QueryInt64("spec_id") |
|
|
|
TrainUrl := ctx.Query("train_url") |
|
|
|
CkptName := ctx.Query("ckpt_name") |
|
|
|
ModelName := ctx.Query("model_name") |
|
|
|
ModelVersion := ctx.Query("model_version") |
|
|
|
repo := ctx.Repo.Repository |
|
|
|
codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/" |
|
|
|
codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" |
|
|
|
//check specification |
|
|
|
spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{ |
|
|
|
JobType: models.JobTypeTrain, |
|
|
|
ComputeResource: models.GPU, |
|
|
|
Cluster: models.C2NetCluster, |
|
|
|
}) |
|
|
|
if err != nil || spec == nil { |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) { |
|
|
|
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID) |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//check dataset |
|
|
|
uuid := srcDataset + ";" + combatDataset |
|
|
|
datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//prepare code and out path |
|
|
|
_, err = ioutil.ReadDir(codeLocalPath) |
|
|
|
if err == nil { |
|
|
|
os.RemoveAll(codeLocalPath) |
|
|
|
} |
|
|
|
|
|
|
|
if err := downloadZipCode(ctx, codeLocalPath, cloudbrain.DefaultBranchName); err != nil { |
|
|
|
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//todo: upload code (send to file_server todo this work?) |
|
|
|
//upload code |
|
|
|
if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" |
|
|
|
if err := mkModelPath(modelPath); err != nil { |
|
|
|
log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//init model readme |
|
|
|
if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
var datasetRemotePath, allFileName string |
|
|
|
for _, datasetInfo := range datasetInfos { |
|
|
|
if datasetRemotePath == "" { |
|
|
|
datasetRemotePath = datasetInfo.DataLocalPath |
|
|
|
allFileName = datasetInfo.FullName |
|
|
|
} else { |
|
|
|
datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath |
|
|
|
allFileName = allFileName + ";" + datasetInfo.FullName |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
//prepare command |
|
|
|
preTrainModelPath := getPreTrainModelPath(TrainUrl, CkptName) |
|
|
|
|
|
|
|
command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, BootFile, Params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, CkptName) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName) |
|
|
|
|
|
|
|
req := &grampus.GenerateTrainJobReq{ |
|
|
|
JobName: jobName, |
|
|
|
DisplayJobName: displayJobName, |
|
|
|
ComputeResource: models.GPUResource, |
|
|
|
ProcessType: grampus.ProcessorTypeGPU, |
|
|
|
Command: command, |
|
|
|
ImageUrl: image, |
|
|
|
Description: description, |
|
|
|
BootFile: BootFile, |
|
|
|
Uuid: uuid, |
|
|
|
CommitID: commitID, |
|
|
|
BranchName: cloudbrain.DefaultBranchName, |
|
|
|
Params: Params, |
|
|
|
EngineName: image, |
|
|
|
DatasetNames: datasetNames, |
|
|
|
DatasetInfos: datasetInfos, |
|
|
|
|
|
|
|
IsLatestVersion: modelarts.IsLatestVersion, |
|
|
|
VersionCount: modelarts.VersionCountOne, |
|
|
|
WorkServerNumber: 1, |
|
|
|
Spec: spec, |
|
|
|
ModelName: ModelName, |
|
|
|
LabelName: evaluationIndex, |
|
|
|
CkptName: CkptName, |
|
|
|
ModelVersion: ModelVersion, |
|
|
|
PreTrainModelUrl: TrainUrl, |
|
|
|
} |
|
|
|
err = grampus.GenerateTrainJob(ctx, req) |
|
|
|
if err != nil { |
|
|
|
log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"]) |
|
|
|
GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewGrampusGpu, nil) |
|
|
|
return |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func createForGrampusNPU(ctx *context.Context, jobName string) { |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
func createForNPU(ctx *context.Context, jobName string) { |
|
|
|
VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) |
|
|
|
BootFile := ctx.Query("boot_file") |
|
|
|
@@ -1214,7 +969,7 @@ func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, D |
|
|
|
|
|
|
|
func modelSafetyNewDataPrepare(ctx *context.Context) error { |
|
|
|
ctx.Data["PageIsCloudBrain"] = true |
|
|
|
|
|
|
|
ctx.Data["type"] = ctx.QueryInt("type") |
|
|
|
ctx.Data["boot_file"] = ctx.Query("boot_file") |
|
|
|
ctx.Data["display_job_name"] = ctx.Query("display_job_name") |
|
|
|
ctx.Data["description"] = ctx.Query("description") |
|
|
|
@@ -1232,10 +987,17 @@ func modelSafetyNewDataPrepare(ctx *context.Context) error { |
|
|
|
ctx.Data["model_name"] = ctx.Query("model_name") |
|
|
|
ctx.Data["model_version"] = ctx.Query("model_version") |
|
|
|
|
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID |
|
|
|
if ctx.QueryInt("type") == models.TypeCloudBrainOne { |
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID |
|
|
|
} else { |
|
|
|
ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName |
|
|
|
ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID |
|
|
|
ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName |
|
|
|
ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID |
|
|
|
} |
|
|
|
|
|
|
|
prepareCloudbrainOneSpecs(ctx) |
|
|
|
|
|
|
|
|