|
|
|
@@ -763,9 +763,23 @@ func trainJobNewDataPrepare(ctx *context.Context) error { |
|
|
|
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") |
|
|
|
ctx.Data["WaitCount"] = waitCount |
|
|
|
|
|
|
|
setMultiNodeIfConfigureMatch(ctx) |
|
|
|
|
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func setMultiNodeIfConfigureMatch(ctx *context.Context) { |
|
|
|
modelarts.InitMultiNode() |
|
|
|
if modelarts.MultiNodeConfig != nil { |
|
|
|
for _, info := range modelarts.MultiNodeConfig.Info { |
|
|
|
if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg { |
|
|
|
ctx.Data["WorkNode"] = info.Node |
|
|
|
break |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) { |
|
|
|
modelarts.InitSpecialPool() |
|
|
|
|
|
|
|
@@ -880,6 +894,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainTwo |
|
|
|
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") |
|
|
|
ctx.Data["WaitCount"] = waitCount |
|
|
|
setMultiNodeIfConfigureMatch(ctx) |
|
|
|
|
|
|
|
return nil |
|
|
|
} |
|
|
|
@@ -1115,6 +1130,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) |
|
|
|
VersionCount := modelarts.VersionCountOne |
|
|
|
EngineName := form.EngineName |
|
|
|
|
|
|
|
errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) |
|
|
|
if errStr!=""{ |
|
|
|
trainJobErrorNewDataPrepare(ctx, form) |
|
|
|
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
@@ -1145,7 +1167,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) |
|
|
|
errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) |
|
|
|
if errStr != "" { |
|
|
|
trainJobErrorNewDataPrepare(ctx, form) |
|
|
|
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) |
|
|
|
@@ -1349,6 +1371,48 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) |
|
|
|
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") |
|
|
|
} |
|
|
|
|
|
|
|
func checkMultiNode(userId int64, serverNum int) string{ |
|
|
|
if serverNum==1{ |
|
|
|
return "" |
|
|
|
} |
|
|
|
modelarts.InitMultiNode() |
|
|
|
var isServerNumValid=false |
|
|
|
if modelarts.MultiNodeConfig != nil { |
|
|
|
for _, info := range modelarts.MultiNodeConfig.Info { |
|
|
|
if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg { |
|
|
|
if isInNodes(info.Node,serverNum){ |
|
|
|
isServerNumValid=true |
|
|
|
break |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
if isServerNumValid{ |
|
|
|
return "" |
|
|
|
}else{ |
|
|
|
return "repo.modelarts.no_node_right" |
|
|
|
} |
|
|
|
} |
|
|
|
// checkInferenceJobMultiNode validates the work-server count requested for an
// inference job.
//
// Only a serverNum of exactly 1 is accepted, in which case the empty string
// is returned. Every other value yields the translation key
// "repo.modelarts.no_node_right". userId is not consulted.
func checkInferenceJobMultiNode(userId int64, serverNum int) string {
	if serverNum != 1 {
		return "repo.modelarts.no_node_right"
	}
	return ""
}
|
|
|
|
|
|
|
// isInNodes reports whether num occurs anywhere in nodes.
// A nil or empty slice never contains num.
func isInNodes(nodes []int, num int) bool {
	for i := 0; i < len(nodes); i++ {
		if nodes[i] == num {
			return true
		}
	}
	return false
}
|
|
|
|
|
|
|
func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) { |
|
|
|
userImageUrl := "" |
|
|
|
userCommand := "" |
|
|
|
@@ -1383,6 +1447,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ |
|
|
|
ctx.Data["PageIsTrainJob"] = true |
|
|
|
var jobID = ctx.Params(":jobid") |
|
|
|
|
|
|
|
errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) |
|
|
|
if errStr!=""{ |
|
|
|
versionErrorDataPrepare(ctx, form) |
|
|
|
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
@@ -1450,7 +1521,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) |
|
|
|
errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) |
|
|
|
if errStr != "" { |
|
|
|
versionErrorDataPrepare(ctx, form) |
|
|
|
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) |
|
|
|
@@ -1718,11 +1789,7 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error { |
|
|
|
log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile)) |
|
|
|
return errors.New("启动文件必须是python文件") |
|
|
|
} |
|
|
|
|
|
|
|
if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 { |
|
|
|
log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber) |
|
|
|
return errors.New("计算节点数必须在1-2之间") |
|
|
|
} |
|
|
|
|
|
|
|
if form.BranchName == "" { |
|
|
|
log.Error("the branch must not be null!", form.BranchName) |
|
|
|
return errors.New("代码分支不能为空!") |
|
|
|
@@ -2021,6 +2088,13 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference |
|
|
|
ckptUrl := "/" + form.TrainUrl + form.CkptName |
|
|
|
log.Info("ckpt url:" + ckptUrl) |
|
|
|
|
|
|
|
errStr:=checkInferenceJobMultiNode(ctx.User.ID,form.WorkServerNumber) |
|
|
|
if errStr!=""{ |
|
|
|
inferenceJobErrorNewDataPrepare(ctx, form) |
|
|
|
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
@@ -2069,7 +2143,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) |
|
|
|
errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) |
|
|
|
if errStr != "" { |
|
|
|
inferenceJobErrorNewDataPrepare(ctx, form) |
|
|
|
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) |
|
|
|
|