Browse Source

提交代码。

Signed-off-by: zouap <zouap@pcl.ac.cn>
tags/v1.21.12.1^2
zouap 4 years ago
parent
commit
44ea702590
8 changed files with 796 additions and 77 deletions
  1. +165
    -14
      models/cloudbrain.go
  2. +1
    -0
      models/models.go
  3. +2
    -0
      modules/auth/modelarts.go
  4. +156
    -26
      modules/modelarts/modelarts.go
  5. +46
    -0
      modules/modelarts/resty.go
  6. +418
    -27
      routers/repo/modelarts.go
  7. +8
    -10
      routers/routes/routes.go
  8. +0
    -0
      templates/repo/modelarts/trainjob/version_new.tmpl

+ 165
- 14
models/cloudbrain.go View File

@@ -30,6 +30,7 @@ const (
JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
JobTypeBrainScore JobType = "BRAINSCORE"
JobTypeTrain JobType = "TRAIN"
JobVersionName JobType = "V0001"

ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中
@@ -68,10 +69,35 @@ type Cloudbrain struct {
CanDel bool `xorm:"-"`
Type int `xorm:"INDEX DEFAULT 0"`

VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string
Uuid string
DatasetName string
VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string
Uuid string
DatasetName string
VersionCount int64 `xorm:"INDEX DEFAULT 1"`
IsLatestVersion string
CommitID string
FatherVersionName string
ComputeResource string
EngineID int64

User *User `xorm:"-"`
Repo *Repository `xorm:"-"`
}

type TrainjobConfigDetail struct {
ID int64 `xorm:"pk autoincr"`
JobID string `xorm:"INDEX"`
JobName string `xorm:"INDEX"`
ResourcePools string `xorm:"INDEX"`
EngineVersions int `xorm:"INDEX"`
FlavorInfos string `xorm:"INDEX"`
TrainUrl string `xorm:"INDEX"`
BootFile string `xorm:"INDEX"`
Uuid string `xorm:"INDEX"`
DatasetName string `xorm:"INDEX"`
Params string `xorm:"INDEX"`
BranchName string `xorm:"INDEX"`
VersionName string `xorm:"INDEX"`

User *User `xorm:"-"`
Repo *Repository `xorm:"-"`
@@ -150,13 +176,16 @@ type CloudbrainsOptions struct {
ListOptions
RepoID int64 // include all repos if empty
UserID int64
JobID int64
JobID string
SortType string
CloudbrainIDs []int64
// JobStatus CloudbrainStatus
Type int
JobType string
Type int
JobType string
VersionName string
IsLatestVersion string
}

type TaskPod struct {
TaskRoleStatus struct {
Name string `json:"name"`
@@ -594,6 +623,33 @@ type Config struct {
PoolID string `json:"pool_id"`
}

// CreateTrainJobVersionParams is the JSON request body used when creating a
// new version of an existing train job.
type CreateTrainJobVersionParams struct {
Description string `json:"job_desc"`
Config TrainJobVersionConfig `json:"config"`
}

// TrainJobVersionConfig is the "config" section of CreateTrainJobVersionParams.
// The commented-out fields are kept from the original author as placeholders
// and are not serialized.
type TrainJobVersionConfig struct {
WorkServerNum int `json:"worker_server_num"`
AppUrl string `json:"app_url"` // code directory of the train job
BootFileUrl string `json:"boot_file_url"` // boot file of the train job; must be located under the code directory
Parameter []Parameter `json:"parameter"`
DataUrl string `json:"data_url"` // OBS path URL of the dataset the train job needs
//DatasetID string `json:"dataset_id"`
//DataVersionID string `json:"dataset_version_id"`
//DataSource []DataSource `json:"data_source"`
//SpecID int64 `json:"spec_id"`
EngineID int64 `json:"engine_id"`
//ModelID int64 `json:"model_id"`
TrainUrl string `json:"train_url"` // OBS path URL for the train job's output files
LogUrl string `json:"log_url"`
//UserImageUrl string `json:"user_image_url"`
//UserCommand string `json:"user_command"`
//Volumes []Volumes `json:"volumes"`
Flavor Flavor `json:"flavor"`
PoolID string `json:"pool_id"`
// PreVersionId is filled by callers with the VersionID of the version the
// new one is derived from.
PreVersionId int64 `json:"pre_version_id"`
}

type CreateConfigParams struct {
ConfigName string `json:"config_name"`
Description string `json:"config_desc"`
@@ -784,12 +840,13 @@ type GetTrainJobResult struct {
//UserImageUrl string `json:"user_image_url"`
//UserCommand string `json:"user_command"`
//Volumes []Volumes `json:"volumes"`
Flavor Flavor `json:"flavor"`
PoolID string `json:"pool_id"`
PoolName string `json:"pool_name"`
NasMountPath string `json:"nas_mount_path"`
NasShareAddr string `json:"nas_share_addr"`
DatasetName string
Flavor Flavor `json:"flavor"`
PoolID string `json:"pool_id"`
PoolName string `json:"pool_name"`
NasMountPath string `json:"nas_mount_path"`
NasShareAddr string `json:"nas_share_addr"`
DatasetName string
ModelMetricList string `json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话
}

type GetTrainJobLogResult struct {
@@ -836,7 +893,7 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
)
}

if (opts.JobID) > 0 {
if (opts.JobID) != "" {
cond = cond.And(
builder.Eq{"cloudbrain.job_id": opts.JobID},
)
@@ -854,6 +911,12 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
)
}

if (opts.IsLatestVersion) != "" {
cond = cond.And(
builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion},
)
}

// switch opts.JobStatus {
// case JobWaiting:
// cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
@@ -896,11 +959,83 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
return cloudbrains, count, nil
}

// CloudbrainsVersionList returns the cloudbrain records matching opts, newest
// first, together with the total number of matching rows (ignoring paging).
//
// Filters applied: RepoID and UserID (when > 0), Type (when >= 0), JobID and
// JobType (when non-empty) and CloudbrainIDs (when non-empty). Note that
// opts.VersionName and opts.IsLatestVersion are NOT applied here, so every
// version of a job is returned.
//
// Paging is applied only when opts.PageSize > 0; Page values 0 and 1 both
// select the first page.
func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
	sess := x.NewSession()
	// The deferred Close is the single point where the session is released;
	// the function must not close it a second time on the success path.
	defer sess.Close()

	cond := builder.NewCond()
	if opts.RepoID > 0 {
		cond = cond.And(builder.Eq{"cloudbrain.repo_id": opts.RepoID})
	}
	if opts.UserID > 0 {
		cond = cond.And(builder.Eq{"cloudbrain.user_id": opts.UserID})
	}
	if opts.Type >= 0 {
		cond = cond.And(builder.Eq{"cloudbrain.type": opts.Type})
	}
	if opts.JobID != "" {
		cond = cond.And(builder.Eq{"cloudbrain.job_id": opts.JobID})
	}
	if opts.JobType != "" {
		cond = cond.And(builder.Eq{"cloudbrain.job_type": opts.JobType})
	}
	if len(opts.CloudbrainIDs) > 0 {
		cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
	}

	// Count before Limit is set so the total is independent of paging.
	count, err := sess.Where(cond).Count(new(Cloudbrain))
	if err != nil {
		return nil, 0, fmt.Errorf("Count: %v", err)
	}

	if opts.Page >= 0 && opts.PageSize > 0 {
		start := 0
		if opts.Page > 0 {
			start = (opts.Page - 1) * opts.PageSize
		}
		sess.Limit(opts.PageSize, start)
	}

	sess.OrderBy("cloudbrain.created_unix DESC")
	cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
	if err := sess.Table(&Cloudbrain{}).Where(cond).
		Join("left", "`user`", "cloudbrain.user_id = `user`.id").
		Find(&cloudbrains); err != nil {
		return nil, 0, fmt.Errorf("Find: %v", err)
	}

	return cloudbrains, count, nil
}

// CreateCloudbrain inserts cloudbrain as a new row through the package-level
// engine and returns the insert error, if any.
func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
	_, err = x.Insert(cloudbrain)
	return err
}

// CreateTrainjobConfigDetail inserts trainjobConfigDetail as a new row through
// the package-level engine and returns the insert error, if any.
func CreateTrainjobConfigDetail(trainjobConfigDetail *TrainjobConfigDetail) (err error) {
	_, err = x.Insert(trainjobConfigDetail)
	return err
}

@@ -924,6 +1059,16 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
return getRepoCloudBrain(cb)
}

// GetCloudbrainByJobIDAndVersionName looks up a cloudbrain record by job ID
// and version name. The two values are passed to getRepoCloudBrain as a
// partially filled Cloudbrain used as the lookup template.
func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{
		JobID:       jobID,
		VersionName: versionName,
	})
}

// GetCloudbrainByJobIDAndIsLatestVersion looks up a cloudbrain record by job
// ID and latest-version flag. The two values are passed to getRepoCloudBrain
// as a partially filled Cloudbrain used as the lookup template.
func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{
		JobID:           jobID,
		IsLatestVersion: isLatestVersion,
	})
}

func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
cloudBrains := make([]*Cloudbrain, 0)
err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
@@ -948,6 +1093,12 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train
return
}

// SetVersionCountAndLatestVersionByJobIDAndVersionName updates the
// version_count and is_latest_version columns of the cloudbrain row identified
// by jobID and versionName.
//
// Cols restricts the UPDATE to exactly those two columns, so zero values
// (versionCount == 0, isLatestVersion == "") are written as given rather than
// skipped.
func SetVersionCountAndLatestVersionByJobIDAndVersionName(jobID string, versionName string, versionCount int64, isLatestVersion string) (err error) {
	cb := &Cloudbrain{
		JobID:           jobID,
		VersionName:     versionName,
		VersionCount:    versionCount,
		IsLatestVersion: isLatestVersion,
	}
	// The column is spelled in snake_case like every other column reference;
	// the previous "version_Count" spelling depended on case-insensitive
	// column matching.
	_, err = x.Cols("version_count", "is_latest_version").
		Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).
		Update(cb)
	return err
}

// UpdateJob persists job by delegating to updateJob with the package-level
// engine x.
func UpdateJob(job *Cloudbrain) error {
return updateJob(x, job)
}


+ 1
- 0
models/models.go View File

@@ -134,6 +134,7 @@ func init() {
new(BlockChain),
new(RecommendOrg),
new(AiModelManage),
new(TrainjobConfigDetail),
)

tablesStatistic = append(tablesStatistic,


+ 2
- 0
modules/auth/modelarts.go View File

@@ -38,6 +38,8 @@ type CreateModelArtsTrainJobForm struct {
IsSaveParam string `form:"is_save_para"`
ParameterTemplateName string `form:"parameter_template_name"`
PrameterDescription string `form:"parameter_description"`
BranchName string `form:"branch_name" binding:"Required"`
VersionName string `form:"version_name" binding:"Required"`
}

func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {


+ 156
- 26
modules/modelarts/modelarts.go View File

@@ -35,16 +35,19 @@ const (
// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
// "]}"
CodePath = "/code/"
OutputPath = "/output/"
LogPath = "/log/"
JobPath = "/job/"
OrderDesc = "desc" //向下查询
OrderAsc = "asc" //向上查询
Lines = 20
TrainUrl = "train_url"
DataUrl = "data_url"
PerPage = 10
CodePath = "/code/"
OutputPath = "/output/"
LogPath = "/log/"
JobPath = "/job/"
OrderDesc = "desc" //向下查询
OrderAsc = "asc" //向上查询
Lines = 20
TrainUrl = "train_url"
DataUrl = "data_url"
PerPage = 10
IsLatestVersion = "1"
NotLatestVersion = "0"
ComputeResource = "NPU"

SortByCreateTime = "create_time"
ConfigTypeCustom = "custom"
@@ -69,6 +72,26 @@ type GenerateTrainJobReq struct {
WorkServerNumber int
EngineID int64
Parameters []models.Parameter
CommitID string
IsLatestVersion string
}

// GenerateTrainJobVersionReq carries the inputs GenerateTrainJobVersion needs
// to create a new version of an existing train job and record it locally.
type GenerateTrainJobVersionReq struct {
JobName string
// Uuid is the attachment UUID; GenerateTrainJobVersion resolves it to the
// dataset name stored with the record.
Uuid string
Description string
CodeObsPath string
BootFile string
DataUrl string
TrainUrl string
FlavorCode string
LogUrl string
PoolID string
WorkServerNumber int
EngineID int64
Parameters []models.Parameter
// PreVersionId is sent as "pre_version_id" in the version-creation request.
PreVersionId int64
CommitID string
}

type VersionInfo struct {
@@ -170,7 +193,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error
return nil
}

func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobresult *models.CreateTrainJobResult, err error) {
jobResult, err := createTrainJob(models.CreateTrainJobParams{
JobName: req.JobName,
Description: req.Description,
@@ -192,35 +215,142 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
})
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
return nil, err
}

attach, err := models.GetAttachmentByUUID(req.Uuid)
if err != nil {
log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
return nil
return nil, err
}

err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
ComputeResource: ComputeResource,
EngineID: req.EngineID,
})

if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return err
return nil, err
}

return nil
return jobResult, nil
}

// GenerateTrainJobVersion creates a new version of train job jobId through
// createTrainJobVersion, stores a matching Cloudbrain record, and then moves
// the "latest version" marker from the previous latest record to the new one
// while refreshing the version count on both.
//
// fatherVersionName is stored on the new record as the version it was derived
// from. On any failure the error is logged and returned with a nil result.
// This function never writes an HTTP response: the caller renders its own
// error page, and responding here as well would produce a double response.
func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string, fatherVersionName string) (jobresult *models.CreateTrainJobResult, err error) {
	jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{
		Description: req.Description,
		Config: models.TrainJobVersionConfig{
			WorkServerNum: req.WorkServerNumber,
			AppUrl:        req.CodeObsPath,
			BootFileUrl:   req.BootFile,
			DataUrl:       req.DataUrl,
			EngineID:      req.EngineID,
			TrainUrl:      req.TrainUrl,
			LogUrl:        req.LogUrl,
			PoolID:        req.PoolID,
			Flavor: models.Flavor{
				Code: req.FlavorCode,
			},
			Parameter:    req.Parameters,
			PreVersionId: req.PreVersionId,
		},
	}, jobId)
	if err != nil {
		log.Error("CreateJob failed: %v", err.Error())
		return nil, err
	}

	attach, err := models.GetAttachmentByUUID(req.Uuid)
	if err != nil {
		// Log the UUID that was actually looked up, not the job ID.
		log.Error("GetAttachmentByUUID(%s) failed:%v", req.Uuid, err.Error())
		return nil, err
	}

	// The job ID reported by the remote service, as stored in the database.
	createdJobID := strconv.FormatInt(jobResult.JobID, 10)

	// The new record is inserted without the latest-version flag on purpose:
	// the lookup below must still find the previous latest version.
	err = models.CreateCloudbrain(&models.Cloudbrain{
		Status:            TransTrainJobStatus(jobResult.Status),
		UserID:            ctx.User.ID,
		RepoID:            ctx.Repo.Repository.ID,
		JobID:             createdJobID,
		JobName:           req.JobName,
		JobType:           string(models.JobTypeTrain),
		Type:              models.TypeCloudBrainTwo,
		VersionID:         jobResult.VersionID,
		VersionName:       jobResult.VersionName,
		Uuid:              req.Uuid,
		DatasetName:       attach.Name,
		CommitID:          req.CommitID,
		FatherVersionName: fatherVersionName,
		ComputeResource:   ComputeResource,
		EngineID:          req.EngineID,
	})
	if err != nil {
		log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
		return nil, err
	}

	// Only the total count is needed; the count does not depend on paging, so
	// the first page is requested simply to bound the discarded result set.
	_, versionCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
		ListOptions: models.ListOptions{
			Page:     1,
			PageSize: setting.UI.IssuePagingNum,
		},
		RepoID:  ctx.Repo.Repository.ID,
		Type:    models.TypeCloudBrainTwo,
		JobType: string(models.JobTypeTrain),
		JobID:   createdJobID,
	})
	if err != nil {
		log.Error("CloudbrainsVersionList(%s) failed:%v", createdJobID, err.Error())
		return nil, err
	}

	// Clear the latest-version flag on the previous latest version.
	latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(createdJobID, IsLatestVersion)
	if err != nil {
		log.Error("GetCloudbrainByJobIDAndIsLatestVersion(%s) failed:%v", createdJobID, err.Error())
		return nil, err
	}
	err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(createdJobID, latestTask.VersionName, versionCount, NotLatestVersion)
	if err != nil {
		log.Error("UpdateJobVersionCount(%s, %s) failed:%v", createdJobID, latestTask.VersionName, err.Error())
		return nil, err
	}

	// Mark the version just created as latest and store the version count.
	err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(createdJobID, jobResult.VersionName, versionCount, IsLatestVersion)
	if err != nil {
		log.Error("UpdateJobVersionCount(%s, %s) failed:%v", createdJobID, jobResult.VersionName, err.Error())
		return nil, err
	}

	return jobResult, nil
}

func TransTrainJobStatus(status int) string {


+ 46
- 0
modules/modelarts/resty.go View File

@@ -377,6 +377,52 @@ sendjob:
return &result, nil
}

// createTrainJobVersion POSTs createJobVersionParams as JSON to
// HOST/v1/<ProjectID><urlTrainJob>/<jobID>/versions and decodes the response
// into a models.CreateTrainJobResult.
//
// If the first response is 401 Unauthorized, getToken is called and the
// request is sent once more. A transport error returns a nil result; a non-200
// status or a response with IsSuccess == false returns the (possibly partially
// filled) result together with an error.
func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) {
checkSetting()
client := getRestyClient()
var result models.CreateTrainJobResult

// retry counts token-refresh retries; at most one is allowed.
retry := 0

sendjob:
res, err := client.R().
SetHeader("Content-Type", "application/json").
SetAuthToken(TOKEN).
SetBody(createJobVersionParams).
SetResult(&result).
Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

if err != nil {
return nil, fmt.Errorf("resty create train-job version: %s", err)
}

// Log the request body that was sent; the marshal error is ignored here.
req, _ := json.Marshal(createJobVersionParams)
log.Info("%s", req)

// On 401, call getToken (its error is ignored) and resend once.
if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
goto sendjob
}

if res.StatusCode() != http.StatusOK {
// Non-200: decode the body as an error payload to report its code and message.
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
}

// A 200 response can still report failure in its body.
if !result.IsSuccess {
log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
}

return &result, nil
}

func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
checkSetting()
client := getRestyClient()


+ 418
- 27
routers/repo/modelarts.go View File

@@ -40,6 +40,7 @@ const (
tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
tplModelArtsTrainJobShowModels base.TplName = "repo/modelarts/trainjob/models/index"
tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
)

// MustEnableDataset check if repository enable internal cb
@@ -493,14 +494,6 @@ func NotebookDel(ctx *context.Context) {
func TrainJobIndex(ctx *context.Context) {
MustEnableModelArts(ctx)

//can, err := canUserCreateTrainJob(ctx.User.ID)
//if err != nil {
// ctx.ServerError("canUserCreateTrainJob", err)
// return
//}
//
//ctx.Data["CanCreate"] = can

repo := ctx.Repo.Repository
page := ctx.QueryInt("page")
if page <= 0 {
@@ -512,9 +505,10 @@ func TrainJobIndex(ctx *context.Context) {
Page: page,
PageSize: setting.UI.IssuePagingNum,
},
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
IsLatestVersion: modelarts.IsLatestVersion,
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
@@ -596,12 +590,96 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
ctx.Data["train_url"] = outputObsPath

Branches, err := ctx.Repo.GitRepo.GetBranches()
if err != nil {
ctx.ServerError("GetBranches error:", err)
return err
}
ctx.Data["Branches"] = Branches
ctx.Data["BranchesCount"] = len(Branches)

configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
if err != nil {
ctx.ServerError("getConfigList failed:", err)
return err
}
ctx.Data["config_list"] = configList.ParaConfigs

return nil
}

// TrainJobNewVersion renders the "new train-job version" page. The template
// data is filled in by trainJobNewVersionDataPrepare; if that fails, a server
// error is reported instead of the page.
func TrainJobNewVersion(ctx *context.Context) {
	if prepErr := trainJobNewVersionDataPrepare(ctx); prepErr != nil {
		ctx.ServerError("get new train-job info failed", prepErr)
		return
	}

	ctx.HTML(200, tplModelArtsTrainJobVersionNew)
}

func trainJobNewVersionDataPrepare(ctx *context.Context) error {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("versionName")
jobID = "19373"

t := time.Now()
var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
ctx.Data["job_name"] = jobName

attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
if err != nil {
ctx.ServerError("GetAllUserAttachments failed:", err)
return err
}
ctx.Data["attachments"] = attachs

var resourcePools modelarts.ResourcePool
if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["resource_pools"] = resourcePools.Info

var engines modelarts.Engine
if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["engines"] = engines.Info

var versionInfos modelarts.VersionInfo
if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["engine_versions"] = versionInfos.Version

var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info

outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
ctx.Data["train_url"] = outputObsPath

Branches, err := ctx.Repo.GitRepo.GetBranches()
if err != nil {
ctx.ServerError("GetBranches error:", err)
return err
}
ctx.Data["Branches"] = Branches
ctx.Data["BranchesCount"] = len(Branches)
ctx.Data["jobID"] = jobID
ctx.Data["versionName"] = versionName

configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
if err != nil {
ctx.ServerError("getConfigList failed:", err)
return err
}
ctx.Data["config_list"] = configList.ParaConfigs

return nil
@@ -625,20 +703,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName
isLatestVersion := modelarts.IsLatestVersion

//can, err := canUserCreateTrainJob(ctx.User.ID)
//if err != nil {
// ctx.ServerError("canUserCreateTrainJob", err)
// return
//}
//
//if !can {
// log.Error("the user can not create train-job")
// ctx.RenderWithErr("the user can not create train-job", tplModelArtsTrainJobNew, &form)
// return
//}

//param check
if err := paramCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
trainJobNewDataPrepare(ctx)
@@ -657,7 +724,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
if err == nil {
os.RemoveAll(codeLocalPath)
}
if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil {

gitRepo, _ := git.OpenRepository(repo.RepoPath())
commitID, _ := gitRepo.GetBranchCommitID(branch_name)

if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
Branch: branch_name,
}); err != nil {
log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
trainJobNewDataPrepare(ctx)

@@ -665,6 +738,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
trainJobNewDataPrepare(ctx)
// ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form)
ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobNew, &form)
@@ -771,10 +845,12 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
LogUrl: logObsPath,
PoolID: poolID,
Uuid: uuid,
Parameters: param,
Parameters: parameters.Parameter,
CommitID: commitID,
IsLatestVersion: isLatestVersion,
}

err = modelarts.GenerateTrainJob(ctx, req)
jobResult, err := modelarts.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
trainJobNewDataPrepare(ctx)
@@ -782,12 +858,258 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return
}
// 保存openi创建训练任务界面的参数
err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{

JobName: req.JobName,
JobID: strconv.FormatInt(jobResult.JobID, 10),
VersionName: jobResult.VersionName,
ResourcePools: form.PoolID,
EngineVersions: form.EngineID,
FlavorInfos: form.Flavor,
TrainUrl: outputObsPath,
BootFile: form.BootFile,
Uuid: form.Attachment,
DatasetName: attach.Name,
Params: form.Params,
BranchName: branch_name,
})

if err != nil {
log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
trainJobNewVersionDataPrepare(ctx)
ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

// TrainJobCreateVersion handles the form submission that creates a new version
// of the train job named by the ":jobid" route parameter.
//
// Steps: validate the form, resolve the dataset attachment, clone the selected
// branch into a local job directory, create the output/log directories and
// upload the code to OBS, optionally save the parameters as a config template,
// create the version via modelarts.GenerateTrainJobVersion (deriving it from
// form.VersionName), and finally store the submitted form values as a
// TrainjobConfigDetail row.
//
// Every failure path logs the error and re-renders the "new version" page with
// an error message.
func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
	ctx.Data["PageIsTrainJob"] = true
	jobID := ctx.Params(":jobid")

	jobName := form.JobName
	uuid := form.Attachment
	description := form.Description
	workServerNumber := form.WorkServerNumber
	engineID := form.EngineID
	bootFile := form.BootFile
	flavorCode := form.Flavor
	params := form.Params
	poolID := form.PoolID
	isSaveParam := form.IsSaveParam
	repo := ctx.Repo.Repository
	codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
	codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
	outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
	logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath
	branchName := form.BranchName
	fatherVersionName := form.VersionName

	// fail re-populates the page data and renders msg on the form.
	fail := func(msg string) {
		trainJobNewVersionDataPrepare(ctx)
		ctx.RenderWithErr(msg, tplModelArtsTrainJobVersionNew, &form)
	}
	// failKeepInputs is like fail but also echoes the submitted values back so
	// the user does not have to re-enter them.
	failKeepInputs := func(msg, datasetName string) {
		trainJobNewVersionDataPrepare(ctx)
		ctx.Data["bootFile"] = form.BootFile
		ctx.Data["uuid"] = form.Attachment
		ctx.Data["datasetName"] = datasetName
		ctx.Data["params"] = form.Params
		ctx.Data["branch_name"] = branchName
		ctx.RenderWithErr(msg, tplModelArtsTrainJobVersionNew, &form)
	}

	if err := paramCheckCreateTrainJob(form); err != nil {
		log.Error("paramCheckCreateTrainJob failed:(%v)", err)
		fail(err.Error())
		return
	}

	attach, err := models.GetAttachmentByUUID(uuid)
	if err != nil {
		log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
		// Always answer the request; returning silently leaves the client
		// without a response.
		fail(err.Error())
		return
	}

	// Built only after the attachment lookup succeeded, so uuid is known to be
	// a stored attachment UUID before it is sliced.
	dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"

	//todo: del the codeLocalPath
	// Best effort: a stale directory that cannot be removed makes the clone
	// below fail, which is reported to the user there.
	if _, readErr := ioutil.ReadDir(codeLocalPath); readErr == nil {
		os.RemoveAll(codeLocalPath)
	}

	gitRepo, err := git.OpenRepository(repo.RepoPath())
	if err != nil {
		log.Error("OpenRepository(%s) failed:%v", repo.FullName(), err)
		failKeepInputs(err.Error(), attach.Name)
		return
	}
	commitID, err := gitRepo.GetBranchCommitID(branchName)
	if err != nil {
		log.Error("GetBranchCommitID(%s) failed:%v", branchName, err)
		failKeepInputs(err.Error(), attach.Name)
		return
	}

	if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
		Branch: branchName,
	}); err != nil {
		log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
		failKeepInputs("创建任务失败,任务名称已存在!", attach.Name)
		return
	}

	//todo: upload code (send to file_server todo this work?)
	if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
		log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
		fail("Failed to obsMkdir_output")
		return
	}

	if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil {
		log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
		fail("Failed to obsMkdir_log")
		return
	}

	if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
		log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
		fail("Failed to uploadCodeToObs")
		return
	}

	//todo: del local code?

	// NOTE(review): the original also assembled a list that prepended
	// train_url/data_url to the user parameters, but never passed it on; only
	// the user-supplied parameters are sent. Confirm that this is intended.
	var parameters models.Parameters
	if len(params) != 0 {
		if err := json.Unmarshal([]byte(params), &parameters); err != nil {
			log.Error("Failed to Unmarshal params: %s (%v)", params, err)
			fail("运行参数错误")
			return
		}
	}

	//save param config
	if isSaveParam == "on" {
		if form.ParameterTemplateName == "" {
			log.Error("ParameterTemplateName is empty")
			fail("保存作业参数时,作业参数名称不能为空")
			return
		}

		_, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
			ConfigName:  form.ParameterTemplateName,
			Description: form.PrameterDescription,
			DataUrl:     dataPath,
			AppUrl:      codeObsPath,
			BootFileUrl: codeObsPath + bootFile,
			TrainUrl:    outputObsPath,
			Flavor: models.Flavor{
				Code: flavorCode,
			},
			WorkServerNum: workServerNumber,
			EngineID:      int64(engineID),
			LogUrl:        logObsPath,
			PoolID:        poolID,
			Parameter:     parameters.Parameter,
		})
		if err != nil {
			log.Error("Failed to CreateTrainJobConfig: %v", err)
			fail("保存作业参数失败:" + err.Error())
			return
		}
	}

	// The parent version supplies the job name and the pre_version_id.
	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, fatherVersionName)
	if err != nil {
		log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
		fail(err.Error())
		return
	}

	req := &modelarts.GenerateTrainJobVersionReq{
		JobName:          task.JobName,
		DataUrl:          dataPath,
		Description:      description,
		CodeObsPath:      codeObsPath,
		BootFile:         codeObsPath + bootFile,
		TrainUrl:         outputObsPath,
		FlavorCode:       flavorCode,
		WorkServerNumber: workServerNumber,
		EngineID:         int64(engineID),
		LogUrl:           logObsPath,
		PoolID:           poolID,
		Uuid:             uuid,
		Parameters:       parameters.Parameter,
		PreVersionId:     task.VersionID,
		CommitID:         commitID,
	}
	jobResult, err := modelarts.GenerateTrainJobVersion(ctx, req, jobID, fatherVersionName)
	if err != nil {
		log.Error("GenerateTrainJob failed:%v", err.Error())
		failKeepInputs(err.Error(), attach.Name)
		return
	}

	// Save the parameters entered on the create-train-job page.
	err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{
		JobName:        req.JobName,
		JobID:          strconv.FormatInt(jobResult.JobID, 10),
		VersionName:    jobResult.VersionName,
		ResourcePools:  form.PoolID,
		EngineVersions: form.EngineID,
		FlavorInfos:    form.Flavor,
		TrainUrl:       outputObsPath,
		BootFile:       form.BootFile,
		Uuid:           form.Attachment,
		DatasetName:    attach.Name,
		Params:         form.Params,
		BranchName:     branchName,
	})
	if err != nil {
		log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
		failKeepInputs(err.Error(), attach.Name)
		return
	}

	// ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
	ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

// readDir reads the directory named by dirname and returns
// a list of directory entries sorted by filename.
func readDir(dirname string) ([]os.FileInfo, error) {
@@ -880,6 +1202,27 @@ func TrainJobShow(ctx *context.Context) {

var jobID = ctx.Params(":jobid")
task, err := models.GetCloudbrainByJobID(jobID)

repo := ctx.Repo.Repository
page := ctx.QueryInt("page")
if page <= 0 {
page = 1
}
VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
ListOptions: models.ListOptions{
Page: page,
PageSize: setting.UI.IssuePagingNum,
},
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
JobID: jobID,
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
return
}

if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
@@ -930,6 +1273,8 @@ func TrainJobShow(ctx *context.Context) {
ctx.Data["task"] = task
ctx.Data["jobID"] = jobID
ctx.Data["result"] = result
ctx.Data["VersionListTasks"] = VersionListTasks
ctx.Data["VersionLisCount"] = VersionListCount
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

@@ -1040,6 +1385,52 @@ func TrainJobStop(ctx *context.Context) {
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

// TrainJobVersionDel handles the request to delete a version of a ModelArts
// train job.
//
// It looks up the Cloudbrain record identified by the ":jobid" and
// ":versionName" request parameters, asks ModelArts to delete the job, removes
// the local record, and finally redirects to the repository's train-job list.
// A failed lookup or a failed ModelArts call re-renders the train-job index
// template with the error; a failure to delete the local record is reported
// as a server error.
func TrainJobVersionDel(ctx *context.Context) {
	var jobID = ctx.Params(":jobid")
	// NOTE(review): the "/del_version" route registered for this handler does
	// not appear to declare a ":versionName" segment, in which case this value
	// is always empty — confirm where the version name is supposed to come from.
	var versionName = ctx.Params(":versionName")

	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
	if err != nil {
		// Do not touch task here: it carries no usable value when the lookup
		// fails (presumably nil), so log the identifiers we searched with.
		log.Error("GetCloudbrainByJobIDAndVersionName(%s, %s) failed:%v", jobID, versionName, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
		return
	}

	// NOTE(review): DelTrainJob receives only the job ID, so this presumably
	// removes the whole remote job rather than a single version — confirm.
	_, err = modelarts.DelTrainJob(jobID)
	if err != nil {
		log.Error("DelTrainJob(%s) failed:%v", task.JobName, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
		return
	}

	// The remote side is gone; now drop the local bookkeeping record.
	err = models.DeleteJob(task)
	if err != nil {
		ctx.ServerError("DeleteJob failed", err)
		return
	}

	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

// TrainJobVersionStop handles the request to stop a running version of a
// ModelArts train job.
//
// It looks up the Cloudbrain record identified by the ":jobid" and
// ":versionName" request parameters, asks ModelArts to stop that job version
// (using the record's VersionID), and redirects to the repository's train-job
// list. A failed lookup or a failed ModelArts call re-renders the train-job
// index template with the error.
func TrainJobVersionStop(ctx *context.Context) {
	var jobID = ctx.Params(":jobid")
	// NOTE(review): the "/stop_version" route registered for this handler does
	// not appear to declare a ":versionName" segment, in which case this value
	// is always empty — confirm where the version name is supposed to come from.
	var versionName = ctx.Params(":versionName")

	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
	if err != nil {
		// Do not touch task here: it carries no usable value when the lookup
		// fails (presumably nil), so log the identifiers we searched with.
		log.Error("GetCloudbrainByJobIDAndVersionName(%s, %s) failed:%v", jobID, versionName, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
		return
	}

	// ModelArts is called with the version ID rendered as a decimal string.
	_, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
	if err != nil {
		log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
		return
	}

	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

func canUserCreateTrainJob(uid int64) (bool, error) {
org, err := models.GetOrgByName(setting.AllowedOrg)
if err != nil {


+ 8
- 10
routers/routes/routes.go View File

@@ -975,16 +975,6 @@ func RegisterRoutes(m *macaron.Macaron) {
}, context.RepoRef())

m.Group("/modelarts", func() {
// m.Get("", reqRepoCloudBrainReader, repo.ModelArtsIndex)
// m.Group("/:jobid", func() {
// m.Get("", reqRepoCloudBrainReader, repo.ModelArtsShow)
// m.Get("/debug", reqRepoCloudBrainReader, repo.ModelArtsDebug)
// m.Post("/stop", reqRepoCloudBrainWriter, repo.ModelArtsStop)
// m.Post("/del", reqRepoCloudBrainWriter, repo.ModelArtsDel)
// })
// m.Get("/create", reqRepoCloudBrainWriter, repo.ModelArtsNew)
// m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsForm{}), repo.ModelArtsCreate)

m.Group("/notebook", func() {
m.Get("", reqRepoCloudBrainReader, repo.NotebookIndex)
m.Group("/:jobid", func() {
@@ -1006,9 +996,17 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog)
m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels)
m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel)
m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion)
m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion)
m.Post("/stop_version", reqRepoCloudBrainWriter, repo.TrainJobVersionStop)
m.Post("/del_version", reqRepoCloudBrainWriter, repo.TrainJobVersionDel)
})
m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew)
m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate)

// m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNewVersion)
// m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion)

m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList)
})
}, context.RepoRef())


+ 0
- 0
templates/repo/modelarts/trainjob/version_new.tmpl View File


Loading…
Cancel
Save