Browse Source

提交代码

tags/v1.22.7.1
ychao_1983 3 years ago
parent
commit
2c6f265cdd
7 changed files with 359 additions and 7 deletions
  1. +8
    -0
      models/cloudbrain.go
  2. +25
    -0
      modules/auth/cloudbrain.go
  3. +31
    -2
      modules/cloudbrain/cloudbrain.go
  4. +1
    -1
      routers/repo/ai_model_manage.go
  5. +280
    -4
      routers/repo/cloudbrain.go
  6. +4
    -0
      routers/repo/grampus.go
  7. +10
    -0
      routers/routes/routes.go

+ 8
- 0
models/cloudbrain.go View File

@@ -1742,6 +1742,14 @@ func GetBenchmarkCountByUserID(userID int64) (int, error) {
return int(count), err
}

// GetWaitingCloudbrainCount returns the number of Cloudbrain records whose
// status is JobWaiting for the given cloudbrain type.
//
// computeResource restricts the count to one compute resource when it is
// non-empty. jobTypes restricts the count to the listed job types; when no
// job type is given, the job type is not filtered at all.
func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...JobType) (int64, error) {
	sess := x.Where("status=? and type=?", JobWaiting, cloudbrainType)
	if len(jobTypes) > 0 {
		// Use the snake_case column name, as the other raw conditions on this
		// table do ("job_type = ?"); the Go field name is not a column.
		sess.In("job_type", jobTypes)
	}
	if computeResource != "" {
		sess.And("compute_resource=?", computeResource)
	}
	return sess.Count(new(Cloudbrain))
}

func GetCloudbrainNotebookCountByUserID(userID int64) (int, error) {
count, err := x.In("status", ModelArtsCreateQueue, ModelArtsCreating, ModelArtsStarting, ModelArtsReadyToStart, ModelArtsResizing, ModelArtsStartQueuing, ModelArtsRunning, ModelArtsRestarting).
And("job_type = ? and user_id = ? and type = ?", JobTypeDebug, userID, TypeCloudBrainTwo).Count(new(Cloudbrain))


+ 25
- 0
modules/auth/cloudbrain.go View File

@@ -50,6 +50,27 @@ type EditImageCloudBrainForm struct {
Topics string `form:"topics"`
}

// CreateCloudBrainInferencForm holds the form fields bound from the
// inference-job creation request. Fields tagged binding:"Required" must be
// present in the submitted form.
type CreateCloudBrainInferencForm struct {
JobName string `form:"job_name" binding:"Required"`
DisplayJobName string `form:"display_job_name" binding:"Required"`
Image string `form:"image" binding:"Required"`
Command string `form:"command" binding:"Required"`
Attachment string `form:"attachment" binding:"Required"`
JobType string `form:"job_type" binding:"Required"`
// NOTE(review): the form key is "get_benchmark_category", not
// "benchmark_category" — confirm this matches the submitting page.
BenchmarkCategory string `form:"get_benchmark_category"`
GpuType string `form:"gpu_type"`
TrainUrl string `form:"train_url"`
TestUrl string `form:"test_url"`
Description string `form:"description"`
ResourceSpecId int `form:"resource_spec_id" binding:"Required"`
BootFile string `form:"boot_file"`
// Params is the raw "run_para_list" value; it is parsed as JSON when the
// job command is built.
Params string `form:"run_para_list"`
BranchName string `form:"branch_name"`
ModelName string `form:"model_name" binding:"Required"`
ModelVersion string `form:"model_version" binding:"Required"`
// CkptName is appended to TrainUrl to form the model path of the job.
CkptName string `form:"ckpt_name" binding:"Required"`
}

// Validate passes the binding errors, the request data, the form itself and
// the request locale to the package-level validate helper and returns its result.
func (f *CreateCloudBrainForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
return validate(errs, ctx.Data, f, ctx.Locale)
}
@@ -61,3 +82,7 @@ func (f *CommitImageCloudBrainForm) Validate(ctx *macaron.Context, errs binding.
// Validate passes the binding errors, the request data, the form itself and
// the request locale to the package-level validate helper and returns its result.
func (f *EditImageCloudBrainForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
return validate(errs, ctx.Data, f, ctx.Locale)
}

// Validate passes the binding errors, the request data, the form itself and
// the request locale to the package-level validate helper and returns its result.
func (f *CreateCloudBrainInferencForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
return validate(errs, ctx.Data, f, ctx.Locale)
}

+ 31
- 2
modules/cloudbrain/cloudbrain.go View File

@@ -37,6 +37,8 @@ const (
Success = "S000"

DefaultBranchName = "master"

ResultPath = "/result"
)

var (
@@ -68,6 +70,11 @@ type GenerateCloudBrainTaskReq struct {
BenchmarkTypeID int
BenchmarkChildTypeID int
ResourceSpecId int
ResultPath string
TrainUrl string
ModelName string
ModelVersion string
CkptName string
}

func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
@@ -213,8 +220,7 @@ func AdminOrImageCreaterRight(ctx *context.Context) {
func GenerateTask(req GenerateCloudBrainTaskReq) error {
var resourceSpec *models.ResourceSpec
var versionCount int

if req.JobType == string(models.JobTypeTrain) {
if req.JobType == string(models.JobTypeTrain) || req.JobType == string(models.JobTypeInference) {
versionCount = 1
if TrainResourceSpecs == nil {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
@@ -277,6 +283,13 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: req.ResultPath,
MountPath: ResultPath,
ReadOnly: true,
},
},
}

if len(req.DatasetInfos) == 1 {
@@ -357,6 +370,11 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
BootFile: req.BootFile,
DatasetName: req.DatasetNames,
Parameters: req.Params,
TrainUrl: req.TrainUrl,
ModelName: req.ModelName,
ModelVersion: req.ModelVersion,
CkptName: req.CkptName,
ResultUrl: req.ResultPath,
CreatedUnix: createTime,
UpdatedUnix: createTime,
CommitID: req.CommitID,
@@ -377,6 +395,8 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
} else if string(models.JobTypeTrain) == req.JobType {
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
} else if string(models.JobTypeInference) == req.JobType {
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
} else {
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
}
@@ -388,6 +408,15 @@ func IsBenchmarkJob(jobType string) bool {
return string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
}

// GetWaitingCloudbrainCount is a best-effort wrapper around
// models.GetWaitingCloudbrainCount. When the query fails, the error is logged
// as a warning and 0 is returned, so callers always receive a usable count.
func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...models.JobType) int64 {
	num, err := models.GetWaitingCloudbrainCount(cloudbrainType, computeResource, jobTypes...)
	if err != nil {
		// The logger takes a printf-style format; without a verb the error
		// would not be rendered as part of the message.
		log.Warn("Get waiting count err: %v", err)
		return 0
	}
	return num
}

func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
jobName := task.JobName



+ 1
- 1
routers/repo/ai_model_manage.go View File

@@ -766,7 +766,7 @@ func QueryModelListForPredict(ctx *context.Context) {
PageSize: -1,
},
RepoID: repoId,
Type: -1,
Type: ctx.QueryInt("type"),
New: -1,
})
if err != nil {


+ 280
- 4
routers/repo/cloudbrain.go View File

@@ -46,6 +46,9 @@ const (

tplCloudBrainTrainJobNew base.TplName = "repo/cloudbrain/trainjob/new"
tplCloudBrainTrainJobShow base.TplName = "repo/cloudbrain/trainjob/show"

tplCloudBrainInferenceJobNew base.TplName = "repo/cloudbrain/inference/new"
tplCloudBrainInferenceJobShow base.TplName = "repo/cloudbrain/inference/show"
)

var (
@@ -200,6 +203,8 @@ func CloudBrainNew(ctx *context.Context) {
ctx.ServerError("get new cloudbrain info failed", err)
return
}
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainOne, "", models.JobTypeDebug)
ctx.Data["WaitCount"] = waitCount
ctx.HTML(200, tplCloudBrainNew)
}

@@ -318,6 +323,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
BenchmarkTypeID: 0,
BenchmarkChildTypeID: 0,
ResourceSpecId: resourceSpecId,
ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
}

err = cloudbrain.GenerateTask(req)
@@ -334,6 +340,138 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
}
}

// CloudBrainInferenceJobCreate handles the submitted inference-job creation
// form. It validates the display name and job type, rejects duplicate names
// and users that already have an active task of that type, uploads the
// repository code and a result directory through uploadCodeToMinio, and then
// asks cloudbrain.GenerateTask to create the job.
//
// On any failure the creation page is re-rendered with an error message; on
// success the client is redirected to the repository's
// /modelarts/inference-job page.
func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBrainInferencForm) {
	ctx.Data["PageIsCloudBrain"] = true
	displayJobName := form.DisplayJobName
	jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
	image := strings.TrimSpace(form.Image)
	uuid := form.Attachment
	jobType := form.JobType
	gpuQueue := form.GpuType
	codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
	resourceSpecId := form.ResourceSpecId
	branchName := form.BranchName
	repo := ctx.Repo.Repository
	msgID := ctx.Data["MsgID"]

	ckptUrl := form.TrainUrl + form.CkptName

	tpl := tplCloudBrainInferenceJobNew
	if jobType == string(models.JobTypeTrain) {
		tpl = tplCloudBrainTrainJobNew
	}

	// renderErr re-populates the page data and re-renders the form with msg,
	// so every failure path leaves the page in the same state.
	renderErr := func(msg string) {
		cloudBrainNewDataPrepare(ctx)
		ctx.RenderWithErr(msg, tpl, &form)
	}

	// Validate the cheap, request-only conditions before touching the
	// database. The display name is also embedded in the shell command built
	// below, so it must be checked before that command is assembled.
	if !jobNamePattern.MatchString(displayJobName) {
		renderErr(ctx.Tr("repo.cloudbrain_jobname_err"))
		return
	}

	// Inference is accepted here: this is the inference-job endpoint and
	// GenerateTask handles JobTypeInference explicitly.
	if jobType != string(models.JobTypeBenchmark) &&
		jobType != string(models.JobTypeDebug) &&
		jobType != string(models.JobTypeTrain) &&
		jobType != string(models.JobTypeInference) {
		log.Error("jobtype error: %s, msgID: %v", jobType, msgID)
		renderErr("jobtype error")
		return
	}

	command := cloudbrain.Command
	if jobType == string(models.JobTypeTrain) || jobType == string(models.JobTypeInference) {
		jobCommand, err := getInferenceJobCommand(form)
		if err != nil {
			log.Error("getInferenceJobCommand failed: %v", err)
			renderErr(err.Error())
			return
		}
		command = jobCommand
	}

	tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName)
	if err != nil {
		// A "job does not exist" error simply means the name is free.
		if !models.IsErrJobNotExist(err) {
			log.Error("system error, %v, msgID: %v", err, msgID)
			renderErr("system error")
			return
		}
	} else if len(tasks) != 0 {
		log.Error("the job name did already exist, msgID: %v", msgID)
		renderErr("the job name did already exist")
		return
	}

	count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, jobType)
	if err != nil {
		log.Error("GetCloudbrainCountByUserID failed:%v, msgID: %v", err, msgID)
		renderErr("system error")
		return
	}
	if count >= 1 {
		log.Error("the user already has running or waiting task, msgID: %v", msgID)
		renderErr(ctx.Tr("repo.cloudbrain.morethanonejob"))
		return
	}

	if branchName == "" {
		branchName = cloudbrain.DefaultBranchName
	}
	// NOTE(review): downloadCode's signature is not visible here, so its
	// result (if any) is still not inspected — confirm and handle if it
	// returns an error.
	downloadCode(repo, codePath, branchName)
	if err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
		log.Error("uploadCodeToMinio(%s) failed: %v, msgID: %v", codePath, err, msgID)
		renderErr("system error")
		return
	}

	resultPath := setting.JobPath + jobName + cloudbrain.ResultPath + "/"
	if err = mkResultPath(resultPath); err != nil {
		log.Error("mkResultPath(%s) failed: %v, msgID: %v", resultPath, err, msgID)
		renderErr("system error")
		return
	}
	if err = uploadCodeToMinio(resultPath, jobName, cloudbrain.ResultPath+"/"); err != nil {
		log.Error("uploadCodeToMinio(%s) failed: %v, msgID: %v", resultPath, err, msgID)
		renderErr("system error")
		return
	}

	// The commit ID is only recorded on the task; a failed lookup leaves it
	// empty instead of blocking job creation.
	commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)

	datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
	if err != nil {
		log.Error("GetDatasetInfo failed: %v, msgID: %v", err, msgID)
		renderErr(ctx.Tr("cloudbrain.error.dataset_select"))
		return
	}

	req := cloudbrain.GenerateCloudBrainTaskReq{
		Ctx:              ctx,
		DisplayJobName:   displayJobName,
		JobName:          jobName,
		Image:            image,
		Command:          command,
		Uuids:            uuid,
		DatasetNames:     datasetNames,
		DatasetInfos:     datasetInfos,
		CodePath:         storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
		ModelPath:        ckptUrl,
		BenchmarkPath:    storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
		Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
		BrainScorePath:   storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
		JobType:          jobType,
		GpuQueue:         gpuQueue,
		Description:      form.Description,
		BranchName:       branchName,
		BootFile:         form.BootFile,
		Params:           form.Params,
		CommitID:         commitID,
		ResourceSpecId:   resourceSpecId,
		ResultPath:       storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
		ModelName:        form.ModelName,
		ModelVersion:     form.ModelVersion,
		CkptName:         form.CkptName,
		TrainUrl:         form.TrainUrl,
	}

	if err = cloudbrain.GenerateTask(req); err != nil {
		renderErr(err.Error())
		return
	}

	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")
}

func CloudBrainRestart(ctx *context.Context) {
var ID = ctx.Params(":id")
var resultCode = "0"
@@ -1181,6 +1319,20 @@ func CloudBrainDownloadModel(ctx *context.Context) {
ctx.Resp.Header().Set("Cache-Control", "max-age=0")
http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
}
// CloudBrainDownloadInferenceResult redirects the client to a presigned
// download URL for one file below "jobs/<jobName>/result/<parentDir>".
// jobName, parentDir and fileName are read from the query string.
//
// Requests whose values contain a ".." path segment, or whose jobName
// contains a path separator, are rejected with 400 so the object path cannot
// leave the job's result directory.
func CloudBrainDownloadInferenceResult(ctx *context.Context) {
	parentDir := ctx.Query("parentDir")
	fileName := ctx.Query("fileName")
	jobName := ctx.Query("jobName")

	// hasParentRef reports whether p contains a ".." segment, treating both
	// '/' and '\\' as separators.
	hasParentRef := func(p string) bool {
		isSep := func(r rune) bool { return r == '/' || r == '\\' }
		for _, seg := range strings.FieldsFunc(p, isSep) {
			if seg == ".." {
				return true
			}
		}
		return false
	}
	if strings.ContainsAny(jobName, "/\\") || hasParentRef(jobName) || hasParentRef(parentDir) || hasParentRef(fileName) {
		log.Error("invalid download path: jobName=%q parentDir=%q fileName=%q", jobName, parentDir, fileName)
		ctx.Error(http.StatusBadRequest, "invalid path")
		return
	}

	filePath := "jobs/" + jobName + "/result/" + parentDir
	url, err := storage.Attachments.PresignedGetURL(filePath, fileName)
	if err != nil {
		log.Error("PresignedGetURL failed: %v, msgID: %v", err, ctx.Data["msgID"])
		ctx.ServerError("PresignedGetURL", err)
		return
	}
	ctx.Resp.Header().Set("Cache-Control", "max-age=0")
	http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
}

func GetRate(ctx *context.Context) {
isObjectDetcionAll := ctx.QueryBool("isObjectDetcionAll")
@@ -1334,13 +1486,21 @@ func uploadCodeToMinio(codePath, jobName, parentDir string) error {
}

// mkModelPath prepares the model directory at modelPath; the README text
// passed to mkPathAndReadMeFile tells the user to put model files there.
func mkModelPath(modelPath string) error {
err := os.MkdirAll(modelPath, os.ModePerm)
return mkPathAndReadMeFile(modelPath, "You can put the model file into this directory and download it by the web page.")
}

// mkResultPath prepares the result directory at resultPath by delegating to
// mkPathAndReadMeFile with a README text that tells the user to put result
// files there. It returns whatever error that helper reports.
func mkResultPath(resultPath string) error {
return mkPathAndReadMeFile(resultPath, "You can put the result file into this directory and download it by the web page.")
}

func mkPathAndReadMeFile(path string, text string) error {
err := os.MkdirAll(path, os.ModePerm)
if err != nil {
log.Error("MkdirAll(%s) failed:%v", modelPath, err)
log.Error("MkdirAll(%s) failed:%v", path, err)
return err
}

fileName := modelPath + "README"
fileName := path + "README"
f, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
if err != nil {
log.Error("OpenFile failed", err.Error())
@@ -1349,7 +1509,7 @@ func mkModelPath(modelPath string) error {

defer f.Close()

_, err = f.WriteString("You can put the model file into this directory and download it by the web page.")
_, err = f.WriteString(text)
if err != nil {
log.Error("WriteString failed", err.Error())
return err
@@ -1802,6 +1962,8 @@ func CloudBrainBenchmarkNew(ctx *context.Context) {
ctx.ServerError("get new cloudbrain info failed", err)
return
}
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainOne, "", models.JobTypeBrainScore, models.JobTypeSnn4imagenet, models.JobTypeBenchmark)
ctx.Data["WaitCount"] = waitCount
ctx.HTML(200, tplCloudBrainBenchmarkNew)
}

@@ -2068,6 +2230,7 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo
BenchmarkTypeID: benchmarkTypeID,
BenchmarkChildTypeID: benchmarkChildTypeID,
ResourceSpecId: resourceSpecId,
ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
}

err = cloudbrain.GenerateTask(req)
@@ -2196,6 +2359,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
BenchmarkTypeID: 0,
BenchmarkChildTypeID: benchmarkChildTypeID,
ResourceSpecId: resourceSpecId,
ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
}

err = cloudbrain.GenerateTask(req)
@@ -2243,9 +2407,121 @@ func CloudBrainTrainJobNew(ctx *context.Context) {
ctx.ServerError("get new train-job info failed", err)
return
}
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainOne, "", models.JobTypeTrain)
ctx.Data["WaitCount"] = waitCount
ctx.HTML(http.StatusOK, tplCloudBrainTrainJobNew)
}

// InferenceCloudBrainJobNew renders the inference-job creation page. It
// prepares the shared page data via cloudBrainNewDataPrepare and exposes the
// number of waiting inference jobs on TypeCloudBrainOne as "WaitCount".
func InferenceCloudBrainJobNew(ctx *context.Context) {
	if err := cloudBrainNewDataPrepare(ctx); err != nil {
		ctx.ServerError("get new inference-job info failed", err)
		return
	}
	ctx.Data["WaitCount"] = cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainOne, "", models.JobTypeInference)
	ctx.HTML(http.StatusOK, tplCloudBrainInferenceJobNew)
}

// InferenceCloudBrainJobShow renders the detail page of the inference job
// identified by the ":jobid" route parameter.
//
// The task's stored JSON run parameters are rewritten into a display string
// of the form "label = value; label = value" before the task is handed to the
// template. A task without stored parameters is shown with an empty string.
func InferenceCloudBrainJobShow(ctx *context.Context) {
	if err := cloudBrainNewDataPrepare(ctx); err != nil {
		ctx.ServerError("get inference-job info failed", err)
		return
	}

	jobID := ctx.Params(":jobid")

	task, err := models.GetCloudbrainByJobID(jobID)
	if err != nil {
		log.Error("GetInferenceTask(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplCloudBrainInferenceJobShow, nil)
		return
	}

	// Convert the run parameters into the "epoch_size = 3; device_target = Ascend" form.
	// The creation form allows an empty parameter list, so only decode when
	// something was stored; decoding "" would fail.
	var parameters models.Parameters
	if strings.TrimSpace(task.Parameters) != "" {
		if err = json.Unmarshal([]byte(task.Parameters), &parameters); err != nil {
			log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
			ctx.RenderWithErr(err.Error(), tplCloudBrainInferenceJobShow, nil)
			return
		}
	}
	pairs := make([]string, 0, len(parameters.Parameter))
	for _, p := range parameters.Parameter {
		pairs = append(pairs, p.Label+" = "+p.Value)
	}
	task.Parameters = strings.Join(pairs, "; ")

	ctx.Data["labelName"] = strings.Fields(task.LabelName)
	ctx.Data["jobID"] = jobID
	ctx.Data["jobName"] = task.JobName
	ctx.Data["displayJobName"] = task.DisplayJobName
	ctx.Data["task"] = task
	ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)

	jobCreators, err := models.GetUserNamesByIDs([]int64{task.UserID})
	if err != nil {
		log.Error("GetUserNamesByIDs(%d) failed: %v", task.UserID, err)
	}
	// The lookup may fail or return nothing; indexing unconditionally would panic.
	if len(jobCreators) > 0 {
		ctx.Data["userName"] = jobCreators[0]
	}

	ctx.HTML(http.StatusOK, tplCloudBrainInferenceJobShow)
}

// DownloadInferenceResultFile sends all objects stored under the task's
// ResultUrl prefix to the client as "<DisplayJobName>.zip". The task is
// looked up by the ":jobid" route parameter and the "version_name" query
// value. Lookup or listing failures are logged and answered with a server
// error.
func DownloadInferenceResultFile(ctx *context.Context) {
	jobID := ctx.Params(":jobid")
	versionName := ctx.Query("version_name")

	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
	if err != nil {
		// Do not touch task here: it may be nil when the lookup fails.
		log.Error("GetCloudbrainByJobIDAndVersionName(%s, %s) failed:%v", jobID, versionName, err)
		ctx.ServerError("GetCloudbrainByJobIDAndVersionName", err)
		return
	}

	allFile, err := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, task.ResultUrl)
	if err != nil {
		log.Error("GetAllObjectByBucketAndPrefixMinio(%s) failed:%v", task.ResultUrl, err)
		ctx.ServerError("GetAllObjectByBucketAndPrefixMinio", err)
		return
	}

	returnFileName := task.DisplayJobName + ".zip"
	MinioDownloadManyFile(task.ResultUrl, ctx, returnFileName, allFile)
}

// getInferenceJobCommand assembles the shell command for a job from the form:
// "python /code/<bootFile>" followed by one " --label=value" argument per
// entry of the JSON-encoded run parameters, with standard output redirected
// to "<ResultPath>/<DisplayJobName>-<LogFile>".
//
// It returns an error when the boot file does not end in ".py" or when the
// run parameters are present but are not valid JSON.
func getInferenceJobCommand(form auth.CreateCloudBrainInferencForm) (string, error) {
	bootFile := strings.TrimSpace(form.BootFile)
	if !strings.HasSuffix(bootFile, ".py") {
		log.Error("bootFile(%s) format error", bootFile)
		return "", errors.New("bootFile format error")
	}

	var args strings.Builder
	if rawParams := form.Params; len(rawParams) != 0 {
		var parsed models.Parameters
		if err := json.Unmarshal([]byte(rawParams), &parsed); err != nil {
			log.Error("Failed to Unmarshal params: %s (%v)", rawParams, err)
			return "", err
		}
		for _, entry := range parsed.Parameter {
			args.WriteString(" --" + entry.Label + "=" + entry.Value)
		}
	}

	logTarget := cloudbrain.ResultPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile
	return "python /code/" + bootFile + args.String() + " > " + logTarget, nil
}

func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) {
var command string
bootFile := strings.TrimSpace(form.BootFile)


+ 4
- 0
routers/repo/grampus.go View File

@@ -43,6 +43,8 @@ func GrampusTrainJobGPUNew(ctx *context.Context) {
ctx.ServerError("get new train-job info failed", err)
return
}
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GPUResource, models.JobTypeTrain)
ctx.Data["WaitCount"] = waitCount
ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
}

@@ -53,6 +55,8 @@ func GrampusTrainJobNPUNew(ctx *context.Context) {
ctx.ServerError("get new train-job info failed", err)
return
}
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.NPUResource, models.JobTypeTrain)
ctx.Data["WaitCount"] = waitCount
ctx.HTML(200, tplGrampusTrainJobNPUNew)
}



+ 10
- 0
routers/routes/routes.go View File

@@ -1090,6 +1090,16 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.CloudBrainTrainJobNew)
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainCreate)
})
m.Group("/inference-job", func() {
m.Group("/:jobid", func() {
m.Get("", reqRepoCloudBrainReader, repo.InferenceCloudBrainJobShow)
m.Get("/result_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.CloudBrainDownloadInferenceResult)

m.Get("/downloadall", repo.DownloadInferenceResultFile)
})
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.InferenceCloudBrainJobNew)
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainInferencForm{}), repo.CloudBrainInferenceJobCreate)
})
}, context.RepoRef())
m.Group("/grampus", func() {
m.Group("/train-job", func() {


Loading…
Cancel
Save