Browse Source

提交代码。

Signed-off-by: zouap <zouap@pcl.ac.cn>
tags/v1.22.7.1
zouap 3 years ago
parent
commit
090b437b15
2 changed files with 209 additions and 213 deletions
  1. +108
    -132
      modules/modelarts/modelarts.go
  2. +101
    -81
      routers/repo/modelarts.go

+ 108
- 132
modules/modelarts/modelarts.go View File

@@ -22,7 +22,7 @@ const (
storageTypeOBS = "obs"
autoStopDuration = 4 * 60 * 60
autoStopDurationMs = 4 * 60 * 60 * 1000
MORDELART_USER_IMAGE_ENGINE_ID = 1000000
MORDELART_USER_IMAGE_ENGINE_ID = -1
DataSetMountPath = "/home/ma-user/work"
NotebookEnv = "Python3"
NotebookType = "Ascend"
@@ -315,106 +315,58 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
return nil
}

func GenerateTrainJobByUserImage(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
createTime := timeutil.TimeStampNow()
jobResult, err := createTrainJobUserImage(models.CreateUserImageTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.UserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
})
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
}

jobId := strconv.FormatInt(jobResult.JobID, 10)
err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobId,
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: req.DatasetName,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
ComputeResource: models.NPUResource,
EngineID: MORDELART_USER_IMAGE_ENGINE_ID,
Image: req.UserImageUrl,
UserImageCommand: req.UserCommand,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
BootFile: req.BootFile,
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
FlavorCode: req.FlavorCode,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
EngineName: req.EngineName,
VersionCount: req.VersionCount,
TotalVersionCount: req.TotalVersionCount,
CreatedUnix: createTime,
UpdatedUnix: createTime,
})

if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
return err
}
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
return nil
}

func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
createTime := timeutil.TimeStampNow()
jobResult, err := createTrainJob(models.CreateTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.Config{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
var jobResult *models.CreateTrainJobResult
var createErr error
if req.EngineID < 0 {
jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.UserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
Parameter: req.Parameters,
},
})
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
})
} else {
jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.Config{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
},
})
}
if createErr != nil {
log.Error("CreateJob failed: %v", createErr.Error())
return createErr
}

jobId := strconv.FormatInt(jobResult.JobID, 10)
err = models.CreateCloudbrain(&models.Cloudbrain{
createErr = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
@@ -448,9 +400,9 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
UpdatedUnix: createTime,
})

if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
return err
if createErr != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
return createErr
}
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
return nil
@@ -482,45 +434,69 @@ func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrain

func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
createTime := timeutil.TimeStampNow()
jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{
Description: req.Description,
Config: models.TrainJobVersionConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
Flavor: models.Flavor{
Code: req.FlavorCode,
var jobResult *models.CreateTrainJobResult
var createErr error
if req.EngineID < 0 {
jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
Description: req.Description,
Config: models.TrainJobVersionConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
},
}, jobId)
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
}, jobId)
} else {
jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
Description: req.Description,
Config: models.TrainJobVersionConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
},
}, jobId)
}
if createErr != nil {
log.Error("CreateJob failed: %v", createErr.Error())
return createErr
}

var jobTypes []string
jobTypes = append(jobTypes, string(models.JobTypeTrain))
repo := ctx.Repo.Repository
VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobTypes: jobTypes,
JobID: strconv.FormatInt(jobResult.JobID, 10),
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
return err
if createErr != nil {
ctx.ServerError("Cloudbrain", createErr)
return createErr
}
//将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount

err = models.CreateCloudbrain(&models.Cloudbrain{
createErr = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
@@ -555,19 +531,19 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
CreatedUnix: createTime,
UpdatedUnix: createTime,
})
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return err
if createErr != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
return createErr
}

//将训练任务的上一版本的isLatestVersion设置为"0"
err = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount)
if err != nil {
ctx.ServerError("Update IsLatestVersion failed", err)
return err
createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount)
if createErr != nil {
ctx.ServerError("Update IsLatestVersion failed", createErr)
return createErr
}

return err
return createErr
}

func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {


+ 101
- 81
routers/repo/modelarts.go View File

@@ -979,7 +979,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
flavorCode := form.Flavor
params := form.Params
poolID := form.PoolID
isSaveParam := form.IsSaveParam
//isSaveParam := form.IsSaveParam
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
@@ -1124,45 +1124,45 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
}

//save param config
if isSaveParam == "on" {
saveparams := append(param, models.Parameter{
Label: modelarts.TrainUrl,
Value: outputObsPath,
}, models.Parameter{
Label: modelarts.DataUrl,
Value: dataPath,
})
if form.ParameterTemplateName == "" {
log.Error("ParameterTemplateName is empty")
trainJobNewDataPrepare(ctx)
ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
return
}
_, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
ConfigName: form.ParameterTemplateName,
Description: form.PrameterDescription,
DataUrl: dataPath,
AppUrl: codeObsPath,
BootFileUrl: codeObsPath + bootFile,
TrainUrl: outputObsPath,
Flavor: models.Flavor{
Code: flavorCode,
},
WorkServerNum: workServerNumber,
EngineID: int64(engineID),
LogUrl: logObsPath,
PoolID: poolID,
Parameter: saveparams,
})
if err != nil {
log.Error("Failed to CreateTrainJobConfig: %v", err)
trainJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
return
}
}
// if isSaveParam == "on" {
// saveparams := append(param, models.Parameter{
// Label: modelarts.TrainUrl,
// Value: outputObsPath,
// }, models.Parameter{
// Label: modelarts.DataUrl,
// Value: dataPath,
// })
// if form.ParameterTemplateName == "" {
// log.Error("ParameterTemplateName is empty")
// trainJobNewDataPrepare(ctx)
// ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
// return
// }
// _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
// ConfigName: form.ParameterTemplateName,
// Description: form.PrameterDescription,
// DataUrl: dataPath,
// AppUrl: codeObsPath,
// BootFileUrl: codeObsPath + bootFile,
// TrainUrl: outputObsPath,
// Flavor: models.Flavor{
// Code: flavorCode,
// },
// WorkServerNum: workServerNumber,
// EngineID: int64(engineID),
// LogUrl: logObsPath,
// PoolID: poolID,
// Parameter: saveparams,
// })
// if err != nil {
// log.Error("Failed to CreateTrainJobConfig: %v", err)
// trainJobErrorNewDataPrepare(ctx, form)
// ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
// return
// }
// }

req := &modelarts.GenerateTrainJobReq{
JobName: jobName,
@@ -1208,6 +1208,26 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

func getUserCommand(engineId int64, req *modelarts.GenerateTrainJobReq) (string, string) {
userImageUrl := ""
userCommand := ""
if engineId < 0 {
userCommand = "/bin/bash /home/work/run_train.sh 's3://" + req.CodeObsPath + "' 'code/" + req.BootFile + "' '/tmp/log/train.log' --'data_url'='s3://" + req.DataUrl + "' --'train_url'='s3://" + req.TrainUrl + "'"
var engines modelarts.Engine
if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
log.Info("json.Unmarshal failed:", err)
}
for _, engine := range engines.Info {
if engine.ID == int(engineId) {
userImageUrl = engine.Url
break
}
}
return userCommand, userImageUrl
}
return userCommand, userImageUrl
}

func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
ctx.Data["PageIsTrainJob"] = true
var jobID = ctx.Params(":jobid")
@@ -1244,7 +1264,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
flavorCode := form.Flavor
params := form.Params
poolID := form.PoolID
isSaveParam := form.IsSaveParam
//isSaveParam := form.IsSaveParam
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
@@ -1364,46 +1384,46 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
})
}

//save param config
if isSaveParam == "on" {
saveparams := append(param, models.Parameter{
Label: modelarts.TrainUrl,
Value: outputObsPath,
}, models.Parameter{
Label: modelarts.DataUrl,
Value: dataPath,
})
if form.ParameterTemplateName == "" {
log.Error("ParameterTemplateName is empty")
versionErrorDataPrepare(ctx, form)
ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
return
}
_, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
ConfigName: form.ParameterTemplateName,
Description: form.PrameterDescription,
DataUrl: dataPath,
AppUrl: codeObsPath,
BootFileUrl: codeObsPath + bootFile,
TrainUrl: outputObsPath,
Flavor: models.Flavor{
Code: flavorCode,
},
WorkServerNum: workServerNumber,
EngineID: int64(engineID),
LogUrl: logObsPath,
PoolID: poolID,
Parameter: saveparams,
})
if err != nil {
log.Error("Failed to CreateTrainJobConfig: %v", err)
versionErrorDataPrepare(ctx, form)
ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
}
// //save param config
// if isSaveParam == "on" {
// saveparams := append(param, models.Parameter{
// Label: modelarts.TrainUrl,
// Value: outputObsPath,
// }, models.Parameter{
// Label: modelarts.DataUrl,
// Value: dataPath,
// })
// if form.ParameterTemplateName == "" {
// log.Error("ParameterTemplateName is empty")
// versionErrorDataPrepare(ctx, form)
// ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
// return
// }
// _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
// ConfigName: form.ParameterTemplateName,
// Description: form.PrameterDescription,
// DataUrl: dataPath,
// AppUrl: codeObsPath,
// BootFileUrl: codeObsPath + bootFile,
// TrainUrl: outputObsPath,
// Flavor: models.Flavor{
// Code: flavorCode,
// },
// WorkServerNum: workServerNumber,
// EngineID: int64(engineID),
// LogUrl: logObsPath,
// PoolID: poolID,
// Parameter: saveparams,
// })
// if err != nil {
// log.Error("Failed to CreateTrainJobConfig: %v", err)
// versionErrorDataPrepare(ctx, form)
// ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
// return
// }
// }

task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
if err != nil {


Loading…
Cancel
Save