Browse Source

create

tags/v1.22.6.2
lewis 3 years ago
parent
commit
10e261c2e8
6 changed files with 199 additions and 199 deletions
  1. +28
    -0
      models/cloudbrain.go
  2. +12
    -10
      modules/auth/grampus.go
  3. +24
    -24
      modules/grampus/grampus.go
  4. +41
    -18
      modules/grampus/resty.go
  5. +91
    -119
      routers/repo/grampus.go
  6. +3
    -28
      templates/repo/grampus/trainjob/npu/new.tmpl

+ 28
- 0
models/cloudbrain.go View File

@@ -146,6 +146,7 @@ type Cloudbrain struct {
PreVersionName string //父版本名称
ComputeResource string //计算资源,例如npu
EngineID int64 //引擎id
ImageID string //grampus image_id

TrainUrl string //输出模型的obs路径
BranchName string //分支名称
@@ -1180,6 +1181,33 @@ type GrampusJobInfo struct {
Tasks []GrampusTasks `json:"tasks"`
}

// GrampusSpec is a single resource specification entry; it is the element
// type of the "resourceSpecs" array in a Grampus resource-spec response.
type GrampusSpec struct {
	// CreatedAt and UpdatedAt are decoded from "createdAt"/"updatedAt".
	// NOTE(review): presumably Unix timestamps — confirm the unit with the API.
	CreatedAt int64 `json:"createdAt"`
	UpdatedAt int64 `json:"updatedAt"`

	// ID identifies the specification.
	ID string `json:"id"`
	// Name is the specification name.
	Name string `json:"name"`
	// ProcessorType is the processor type string reported for the spec.
	ProcessorType string `json:"processorType"`
}

// GetGrampusResourceSpecsResult is the decoded body of a Grampus
// resource-spec listing. The embedded GrampusResult supplies the fields
// shared by Grampus responses.
type GetGrampusResourceSpecsResult struct {
	GrampusResult

	// Infos is filled from the "resourceSpecs" array of the response.
	Infos []GrampusSpec `json:"resourceSpecs"`
}

// GrampusImage is a single image entry; it is the element type of the
// "images" array in a Grampus image-listing response.
type GrampusImage struct {
	// CreatedAt and UpdatedAt are decoded from "createdAt"/"updatedAt".
	// NOTE(review): presumably Unix timestamps — confirm the unit with the API.
	CreatedAt int64 `json:"createdAt"`
	UpdatedAt int64 `json:"updatedAt"`

	// ID identifies the image.
	ID string `json:"id"`
	// Name is the image name.
	Name string `json:"name"`
	// ProcessorType is the processor type string reported for the image.
	ProcessorType string `json:"processorType"`
}

// GetGrampusImagesResult is the decoded body of a Grampus image listing.
// The embedded GrampusResult supplies the fields shared by Grampus responses.
type GetGrampusImagesResult struct {
	GrampusResult

	// TotalSize is decoded from "totalSize".
	// NOTE(review): presumably the total number of images — confirm.
	TotalSize int `json:"totalSize"`
	// Infos is filled from the "images" array of the response.
	Infos []GrampusImage `json:"images"`
}

type CreateGrampusJobResponse struct {
GrampusResult
JobInfo GrampusJobInfo `json:"otJob"`


+ 12
- 10
modules/auth/grampus.go View File

@@ -6,16 +6,18 @@ import (
)

type CreateGrampusTrainJobForm struct {
DisplayJobName string `form:"display_job_name" binding:"Required"`
JobName string `form:"job_name" binding:"Required"`
Attachment string `form:"attachment" binding:"Required"`
BootFile string `form:"boot_file" binding:"Required"`
Flavor string `form:"flavor" binding:"Required"`
Params string `form:"run_para_list" binding:"Required"`
Description string `form:"description"`
BranchName string `form:"branch_name" binding:"Required"`
FlavorName string `form:"flaver_names" binding:"Required"`
EngineName string `form:"engine_names" binding:"Required"`
DisplayJobName string `form:"display_job_name" binding:"Required"`
JobName string `form:"job_name" binding:"Required"`
Attachment string `form:"attachment" binding:"Required"`
BootFile string `form:"boot_file" binding:"Required"`
ImageID string `form:"image_id" binding:"Required"`
FlavorID string `form:"flavor" binding:"Required"`
Params string `form:"run_para_list" binding:"Required"`
Description string `form:"description"`
BranchName string `form:"branch_name" binding:"Required"`
FlavorName string `form:"flaver_names" binding:"Required"`
EngineName string `form:"engine_names" binding:"Required"`
WorkServerNumber int `form:"work_server_number" binding:"Required"`
}

func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {


+ 24
- 24
modules/grampus/grampus.go View File

@@ -42,6 +42,9 @@ const (
SortByCreateTime = "create_time"
ConfigTypeCustom = "custom"
TotalVersionCount = 1

ProcessorTypeNPU = "npu.huawei.com/NPU"
ProcessorTypeGPU = "nvidia.com/gpu"
)

var (
@@ -54,7 +57,7 @@ type GenerateTrainJobReq struct {
JobName string
Command string
ResourceSpecId string
ImageUrl string
ImageUrl string //与image_id二选一,都有的情况下优先image_url
ImageId string

DisplayJobName string
@@ -102,29 +105,26 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error

jobID := jobResult.JobInfo.JobID
err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.JobInfo.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobID,
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainGrampus,
//VersionID: jobResult.VersionID,
//VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: req.DatasetName,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
ComputeResource: req.ComputeResource,
//EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
BootFile: req.BootFile,
DataUrl: req.DataUrl,
//LogUrl: req.LogUrl,
//FlavorCode: req.FlavorCode,
Status: TransTrainJobStatus(jobResult.JobInfo.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobID,
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainGrampus,
Uuid: req.Uuid,
DatasetName: req.DatasetName,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
ComputeResource: req.ComputeResource,
ImageID: req.ImageId,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
BootFile: req.BootFile,
DataUrl: req.DataUrl,
FlavorCode: req.ResourceSpecId,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,


+ 41
- 18
modules/grampus/resty.go View File

@@ -22,7 +22,8 @@ const (

urlGetToken = urlOpenApiV1 + "token"
urlTrainJob = urlOpenApiV1 + "trainjob"
urlResourceSpecs = "/job/resource-specs"
urlGetResourceSpecs = urlOpenApiV1 + "resourcespec"
urlGetImages = urlOpenApiV1 + "image"
urlTrainJobConfig = "/training-job-configs"
errorCodeExceedLimit = "ModelArts.0118"

@@ -155,43 +156,65 @@ sendjob:
return &result, nil
}

func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
func GetResourceSpecs(processorType string) (*models.GetGrampusResourceSpecsResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetResourceSpecsResult
var result models.GetGrampusResourceSpecsResult

retry := 0

sendjob:
res, err := client.R().
SetHeader("Content-Type", "application/json").
_, err := client.R().
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs)
Get(HOST + urlGetResourceSpecs + "?processorType=" + processorType)

if err != nil {
return nil, fmt.Errorf("resty GetResourceSpecs: %v", err)
}

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
if result.ErrorCode == errorIllegalToken && retry < 1 {
retry++
log.Info("retry get token")
_ = getToken()
goto sendjob
}

if res.StatusCode() != http.StatusOK {
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
if result.ErrorCode != 0 {
log.Error("GetResourceSpecs failed(%d): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf("GetResourceSpecs failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}

return &result, nil
}

func GetImages(processorType string) (*models.GetGrampusImagesResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetGrampusImagesResult

retry := 0

sendjob:
_, err := client.R().
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + urlGetImages + "?processorType=" + processorType)

if err != nil {
return nil, fmt.Errorf("resty GetImages: %v", err)
}

if !result.IsSuccess {
log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
if result.ErrorCode == errorIllegalToken && retry < 1 {
retry++
log.Info("retry get token")
_ = getToken()
goto sendjob
}

if result.ErrorCode != 0 {
log.Error("GetImages failed(%d): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf("GetImages failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}

return &result, nil


+ 91
- 119
routers/repo/grampus.go View File

@@ -135,18 +135,13 @@ func grampusGpuNewDataPrepare(ctx *context.Context) error {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
}
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec
ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName

ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath
ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled

ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath
ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled

ctx.Data["cloudbraintype"] = models.TypeCloudBrainOne

ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode")
branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
if err != nil {
log.Error("GetBranches error:", err)
}
ctx.Data["branches"] = branches
ctx.Data["branchName"] = ctx.Repo.BranchName

return nil
}
@@ -170,51 +165,37 @@ func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error {
//get valid dataset
attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
if err != nil {
ctx.ServerError("GetAllUserAttachments failed:", err)
return err
}
ctx.Data["attachments"] = attachs

//get valid resource specs
var resourcePools modelarts.ResourcePool
if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
log.Error("GetModelArtsTrainAttachments failed:", err.Error())
} else {
ctx.Data["attachments"] = attachs
}
ctx.Data["resource_pools"] = resourcePools.Info

var engines modelarts.Engine
if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
//get valid engines
images, err := grampus.GetImages(grampus.ProcessorTypeNPU)
if err != nil {
log.Error("GetResourceSpecs failed:", err.Error())
} else {
ctx.Data["engine_versions"] = images.Infos
}
ctx.Data["engines"] = engines.Info

var versionInfos modelarts.VersionInfo
if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
//get valid resource specs
specs, err := grampus.GetResourceSpecs(grampus.ProcessorTypeNPU)
if err != nil {
log.Error("GetResourceSpecs failed:", err.Error())
} else {
ctx.Data["flavor_infos"] = specs.Infos
}
ctx.Data["engine_versions"] = versionInfos.Version

var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
//get branches
branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
if err != nil {
log.Error("GetBranches error:", err.Error())
} else {
ctx.Data["branches"] = branches
}
ctx.Data["flavor_infos"] = flavorInfos.Info

ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName

configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
if err != nil {
ctx.ServerError("getConfigList failed:", err)
return err
}
ctx.Data["config_list"] = configList.ParaConfigs
ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo

return nil
}

@@ -246,10 +227,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branchName := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
FlavorName := form.FlavorName
VersionCount := modelarts.VersionCount
EngineName := form.EngineName
flavorName := form.FlavorName
versionCount := modelarts.VersionCount
engineName := form.EngineName

//check count limit
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource)
if err != nil {
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
@@ -265,12 +247,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
}

//check param
if err := grampusParamCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
grampusTrainJobNpuNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
return
}

//check whether the task name in the project is duplicated
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
if err == nil {
@@ -295,9 +279,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
os.RemoveAll(codeLocalPath)
}

gitRepo, _ := git.OpenRepository(repo.RepoPath())
commitID, _ := gitRepo.GetBranchCommitID(branchName)

if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
grampusTrainJobNpuNewDataPrepare(ctx)
@@ -321,7 +302,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
// if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
grampusTrainJobNpuNewDataPrepare(ctx)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form)
@@ -330,9 +310,9 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain

//prepare command
//todo: download code, download dataset, unzip dataset, exec code, upload model
command, err := generateCommand(grampus.ProcessorTypeNPU, codeObsPath, dataPath, params, "")
var parameters models.Parameters
param := make([]models.Parameter, 0)
existDeviceTarget := false
if len(params) != 0 {
err := json.Unmarshal([]byte(params), &parameters)
if err != nil {
@@ -343,63 +323,45 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}

for _, parameter := range parameters.Parameter {
if parameter.Label == modelarts.DeviceTarget {
existDeviceTarget = true
}
if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
param = append(param, models.Parameter{
Label: parameter.Label,
Value: parameter.Value,
})
}
param = append(param, models.Parameter{
Label: parameter.Label,
Value: parameter.Value,
})
}
}
if !existDeviceTarget {
param = append(param, models.Parameter{
Label: modelarts.DeviceTarget,
Value: modelarts.Ascend,
})
}
param = append(param, models.Parameter{
Label: modelarts.DeviceTarget,
Value: modelarts.Ascend,
})

gitRepo, _ := git.OpenRepository(repo.RepoPath())
commitID, _ := gitRepo.GetBranchCommitID(branchName)

req := &grampus.GenerateTrainJobReq{
JobName: jobName,
DisplayJobName: displayJobName,
ComputeResource: models.NPUResource,
Command: "echo test",
ResourceSpecId: "f2497d54732b45fb8d887e63be1db4a7",
ImageUrl: "",
ImageId: "e6e85cd78ca24e158f71b6fac9c2fb95",

DataUrl: dataPath,
Description: description,
CodeObsPath: codeObsPath,
BootFileUrl: codeObsPath + bootFile,
BootFile: bootFile,
//TrainUrl: outputObsPath,
//FlavorCode: flavorCode,
WorkServerNumber: 1,
//EngineID: int64(engineID),
//LogUrl: logObsPath,
//PoolID: poolID,
Uuid: uuid,
//Parameters: param,
JobName: jobName,
DisplayJobName: displayJobName,
ComputeResource: models.NPUResource,
Command: command,
ResourceSpecId: form.FlavorID,
ImageUrl: "",
ImageId: form.ImageID,
DataUrl: dataPath,
Description: description,
CodeObsPath: codeObsPath,
BootFileUrl: codeObsPath + bootFile,
BootFile: bootFile,
WorkServerNumber: form.WorkServerNumber,
Uuid: uuid,
CommitID: commitID,
IsLatestVersion: isLatestVersion,
BranchName: branchName,
Params: form.Params,
FlavorName: FlavorName,
EngineName: EngineName,
VersionCount: VersionCount,
FlavorName: flavorName,
EngineName: engineName,
VersionCount: versionCount,
TotalVersionCount: modelarts.TotalVersionCount,
}

//将params转换Parameters.Parameter,出错时返回给前端
var Parameters modelarts.Parameters
if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return
}

err = grampus.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
@@ -517,25 +479,6 @@ func GrampusTrainJobShow(ctx *context.Context) {
task.DatasetName = attachment.Name
}

if len(task.Parameters) > 0 {
var parameters models.Parameters
err := json.Unmarshal([]byte(task.Parameters), &parameters)
if err != nil {
log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
ctx.ServerError("system error", err)
return
}

if len(parameters.Parameter) > 0 {
paramTemp := ""
for _, Parameter := range parameters.Parameter {
param := Parameter.Label + " = " + Parameter.Value + "; "
paramTemp = paramTemp + param
}
task.Parameters = paramTemp[:len(paramTemp)-2]
}
}

if task.DeletedAt.IsZero() { //normal record
result, err := grampus.GetJob(task.JobID)
if err != nil {
@@ -565,6 +508,25 @@ func GrampusTrainJobShow(ctx *context.Context) {
}
}

if len(task.Parameters) > 0 {
var parameters models.Parameters
err := json.Unmarshal([]byte(task.Parameters), &parameters)
if err != nil {
log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
ctx.ServerError("system error", err)
return
}

if len(parameters.Parameter) > 0 {
paramTemp := ""
for _, Parameter := range parameters.Parameter {
param := Parameter.Label + " = " + Parameter.Value + "; "
paramTemp = paramTemp + param
}
task.Parameters = paramTemp[:len(paramTemp)-2]
}
}

taskList := make([]*models.Cloudbrain, 0)
taskList = append(taskList, task)
ctx.Data["version_list_task"] = taskList
@@ -595,3 +557,13 @@ func GrampusGetLog(ctx *context.Context) {

return
}

// generateCommand is meant to assemble the command line for a training job.
//
// It is currently a placeholder: none of the arguments (processorType,
// codePath, dataPath, params, outputPath) are read, and it always returns an
// empty command together with a nil error.
func generateCommand(processorType, codePath, dataPath, params, outputPath string) (string, error) {
	// Steps still to be implemented, in order:
	//   1. download the code
	//   2. download the dataset
	//   3. unzip the dataset
	//   4. execute the code
	//   5. upload the resulting models
	return "", nil
}

+ 3
- 28
templates/repo/grampus/trainjob/npu/new.tmpl View File

@@ -136,18 +136,10 @@

<div class="required unite min_title inline fields" style="width: 90%;">
<label style="font-weight: normal;">{{.i18n.Tr "repo.modelarts.train_job.AI_driver"}}&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</label>
<div class="field" style="flex: 1.5;">
<select class="ui dropdown width" id="trainjob_engines" >
{{range .engines}}
<option value="{{.Value}}">{{.Value}}</option>
{{end}}
</select>
</div>

<div class="field" style="flex: 2;" id="engine_name">
<select class="ui dropdown width" id="trainjob_engine_versions" style='width: 100%;' name="engine_id">
<select class="ui dropdown width" id="trainjob_engine_versions" style='width: 100%;' name="image_id">
{{range .engine_versions}}
<option name="engine_id" value="{{.ID}}">{{.Value}}</option>
<option name="image_id" value="{{.ID}}">{{.Name}}</option>
{{end}}
</select>

@@ -175,22 +167,6 @@
<span id="add_run_para" style="margin-left: 0.5rem;cursor:pointer;color: rgba(3, 102, 214, 100);font-size: 14px;line-height: 26px;font-family: SourceHanSansSC-medium;"><i class="plus square outline icon"></i>{{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}</span>
<input id="store_run_para" type="hidden" name="run_para_list">
<div class="dynamic field" style="margin-top: 1rem;">
{{if ne 0 (len .params)}}
{{range $k ,$v := .params}}
<div class="two fields width85" id="para{{$k}}">
<div class="field">
<input type="text" name="shipping_first-name" value={{$v.Label}} required>
</div>
<div class="field">
<input type="text" name="shipping_last-name" value={{$v.Value}} required>
</div>
<span>
<i class="trash icon"></i>
</span>

</div>
{{end}}
{{end}}
</div>
</div>

@@ -224,7 +200,7 @@
<label style="font-weight: normal;">{{.i18n.Tr "repo.modelarts.train_job.standard"}}</label>
<select class="ui dropdown width81" id="trainjob-flavor" style='width:385px' name="flavor">
{{range .flavor_infos}}
<option name="flavor" value="{{.Code}}">{{.Value}}</option>
<option name="flavor" value="{{.ID}}">{{.Name}}</option>
{{end}}
</select>
</div>
@@ -237,7 +213,6 @@
<div class="field" id="trainjob_work_server_num_select" name="work_server_number_select">
<select class="ui dropdown width" style='width: 100%;' name="work_server_id">
<option name="server_id" value="1">1</option>
<option name="server_id" value="2">2</option>
</select>
</div>



Loading…
Cancel
Save