diff --git a/custom/conf/app.ini.sample b/custom/conf/app.ini.sample index c0915f2ea..8bc971f2c 100755 --- a/custom/conf/app.ini.sample +++ b/custom/conf/app.ini.sample @@ -1096,6 +1096,7 @@ LOCATION = cn-south-222 BASE_PATH = attachment/ [modelarts] +ORGANIZATION = modelarts ENDPOINT = https://modelarts.cn-south-222.ai.pcl.cn PROJECT_ID = edfccf24aace4e17a56da6bcbb55a5aa PROJECT_NAME = cn-south-222_test diff --git a/models/attachment.go b/models/attachment.go index 96594bbb3..75e937913 100755 --- a/models/attachment.go +++ b/models/attachment.go @@ -429,7 +429,7 @@ func GetAllUserAttachments(userID int64) ([]*AttachmentUsername, error) { func getModelArtsUserAttachments(e Engine, userID int64) ([]*AttachmentUsername, error) { attachments := make([]*AttachmentUsername, 0, 10) if err := e.Table("attachment").Join("LEFT", "`user`", "attachment.uploader_id "+ - "= `user`.id").Where("attachment.type = ? and (uploader_id= ? or is_private = ?)", TypeCloudBrainTwo, userID, false).Find(&attachments); err != nil { + "= `user`.id").Where("attachment.type = ? and (uploader_id= ? 
or is_private = ?)", TypeCloudBrainNotebook, userID, false).Find(&attachments); err != nil { return nil, err } return attachments, nil diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 11defbb21..d83e38d32 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -5,6 +5,7 @@ import ( "fmt" "strings" "time" + "xorm.io/builder" "xorm.io/xorm" @@ -47,22 +48,28 @@ const ( ) type Cloudbrain struct { - ID int64 `xorm:"pk autoincr"` - JobID string `xorm:"INDEX NOT NULL"` - JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"` - JobName string `xorm:"INDEX"` - Status string `xorm:"INDEX"` - UserID int64 `xorm:"INDEX"` - RepoID int64 `xorm:"INDEX"` - SubTaskName string `xorm:"INDEX"` - ContainerID string - ContainerIp string - CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"` - UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` - DeletedAt time.Time `xorm:"deleted"` - CanDebug bool `xorm:"-"` - CanDel bool `xorm:"-"` - Type int `xorm:"INDEX DEFAULT 0"` + ID int64 `xorm:"pk autoincr"` + JobID string `xorm:"INDEX NOT NULL"` + JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"` + JobName string `xorm:"INDEX"` + Status string `xorm:"INDEX"` + UserID int64 `xorm:"INDEX"` + RepoID int64 `xorm:"INDEX"` + SubTaskName string `xorm:"INDEX"` + ContainerID string + ContainerIp string + CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"` + UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` + Duration int `xorm:"INDEX duration"` + TrainJobDuration string + DeletedAt time.Time `xorm:"deleted"` + CanDebug bool `xorm:"-"` + CanDel bool `xorm:"-"` + Type int `xorm:"INDEX DEFAULT 0"` + + VersionID int64 `xorm:"INDEX DEFAULT 0"` + VersionName string + Uuid string User *User `xorm:"-"` Repo *Repository `xorm:"-"` @@ -555,6 +562,260 @@ type NotebookDelResult struct { InstanceID string `json:"instance_id"` } +type CreateTrainJobParams struct { + JobName string `json:"job_name"` + Description string `json:"job_desc"` + Config Config `json:"config"` + WorkspaceID 
string `json:"workspace_id"` +} + +type Config struct { + WorkServerNum int `json:"worker_server_num"` + AppUrl string `json:"app_url"` //训练作业的代码目录 + BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 + Parameter []Parameter `json:"parameter"` + DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL + //DatasetID string `json:"dataset_id"` + //DataVersionID string `json:"dataset_version_id"` + //DataSource []DataSource `json:"data_source"` + //SpecID int64 `json:"spec_id"` + EngineID int64 `json:"engine_id"` + //ModelID int64 `json:"model_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + //UserImageUrl string `json:"user_image_url"` + //UserCommand string `json:"user_command"` + CreateVersion bool `json:"create_version"` + //Volumes []Volumes `json:"volumes"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` +} + +type CreateConfigParams struct { + ConfigName string `json:"config_name"` + Description string `json:"config_desc"` + WorkServerNum int `json:"worker_server_num"` + AppUrl string `json:"app_url"` //训练作业的代码目录 + BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 + Parameter []Parameter `json:"parameter"` + DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL + //DatasetID string `json:"dataset_id"` + //DataVersionID string `json:"dataset_version_id"` + //DataSource []DataSource `json:"data_source"` + //SpecID int64 `json:"spec_id"` + EngineID int64 `json:"engine_id"` + //ModelID int64 `json:"model_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + //UserImageUrl string `json:"user_image_url"` + //UserCommand string `json:"user_command"` + //CreateVersion bool `json:"create_version"` + //Volumes []Volumes `json:"volumes"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` +} + +type Parameter struct { + Label string `json:"label"` + Value string `json:"value"` +} + +type Parameters struct { + Parameter 
[]Parameter `json:"parameter"` +} + +type DataSource struct { + DatasetID string `json:"dataset_id"` + DatasetVersion string `json:"dataset_version"` + Type string `json:"type"` + DataUrl string `json:"data_url"` +} + +type Volumes struct { + Nfs Nfs `json:"nfs"` + HostPath HostPath `json:"host_path"` +} + +type Nfs struct { + ID string `json:"id"` + SourcePath string `json:"src_path"` + DestPath string `json:"dest_path"` + ReadOnly bool `json:"read_only"` +} + +type HostPath struct { + SourcePath string `json:"src_path"` + DestPath string `json:"dest_path"` + ReadOnly bool `json:"read_only"` +} + +type Flavor struct { + Code string `json:"code"` +} + +type CreateTrainJobResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool `json:"is_success"` + JobName string `json:"job_name"` + JobID int64 `json:"job_id"` + Status int `json:"status"` + CreateTime int64 `json:"create_time"` + VersionID int64 `json:"version_id"` + ResourceID string `json:"resource_id"` + VersionName string `json:"version_name"` +} + +type CreateTrainJobConfigResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool `json:"is_success"` +} + +type GetResourceSpecsResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool `json:"is_success"` + SpecTotalCount int `json:"spec_total_count"` + Specs []Specs `json:"specs"` +} + +type Specs struct { + Core string `json:"core"` + Cpu string `json:"cpu"` + IsNoResource bool `json:"no_resource"` + GpuType string `json:"gpu_type"` + SpecID int64 `json:"spec_id"` + GpuNum int `json:"gpu_num"` + SpecCode string `json:"spec_code"` + Storage string `json:"storage"` + MaxNum int `json:"max_num"` + UnitNum int `json:"unit_num"` + InterfaceType int `json:"interface_type"` +} + +type GetConfigListResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool 
`json:"is_success"` + ConfigTotalCount int `json:"config_total_count"` + ParaConfigs []ParaConfig `json:"configs"` +} + +type ParaConfig struct { + ConfigName string `json:"config_name"` + ConfigDesc string `json:"config_desc"` + CreateTime int64 `json:"create_time"` + EngineType int `json:"engine_type"` + EngineName string `json:"engine_name"` + EngineId int64 `json:"engine_id"` + EngineVersion string `json:"engine_version"` + UserImageUrl string `json:"user_image_url"` + UserCommand string `json:"user_command"` + Result GetConfigResult +} + +type GetConfigResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool `json:"is_success"` + ConfigName string `json:"config_name"` + Description string `json:"config_desc"` + WorkServerNum int `json:"worker_server_num"` + AppUrl string `json:"app_url"` //训练作业的代码目录 + BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 + Parameter []Parameter `json:"parameter"` + DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL + //DatasetID string `json:"dataset_id"` + //DataVersionID string `json:"dataset_version_id"` + //DataSource []DataSource `json:"data_source"` + //SpecID int64 `json:"spec_id"` + EngineID int64 `json:"engine_id"` + //ModelID int64 `json:"model_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + //UserImageUrl string `json:"user_image_url"` + //UserCommand string `json:"user_command"` + //CreateVersion bool `json:"create_version"` + //Volumes []Volumes `json:"volumes"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` +} + +type ErrorResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_message"` + IsSuccess bool `json:"is_success"` +} + +type GetTrainJobResult struct { + IsSuccess bool `json:"is_success"` + JobName string `json:"job_name"` + JobID int64 `json:"job_id"` + Description string `json:"job_desc"` + IntStatus int `json:"status"` + Status string + 
LongCreateTime int64 `json:"create_time"` + CreateTime string + Duration int64 `json:"duration"` //训练作业的运行时间,单位为毫秒 + TrainJobDuration string //训练作业的运行时间,格式为hh:mm:ss + VersionID int64 `json:"version_id"` + ResourceID string `json:"resource_id"` + VersionName string `json:"version_name"` + PreVersionID int64 `json:"pre_version_id"` + WorkServerNum int `json:"worker_server_num"` + AppUrl string `json:"app_url"` //训练作业的代码目录 + BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 + Parameter []Parameter `json:"parameter"` + DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL + //DatasetID string `json:"dataset_id"` + //DataVersionID string `json:"dataset_version_id"` + //DataSource []DataSource `json:"data_source"` + //SpecID int64 `json:"spec_id"` + EngineID int64 `json:"engine_id"` + EngineName string `json:"engine_name"` + EngineVersion string `json:"engine_version"` + //ModelID int64 `json:"model_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + //UserImageUrl string `json:"user_image_url"` + //UserCommand string `json:"user_command"` + //Volumes []Volumes `json:"volumes"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` + PoolName string `json:"pool_name"` + NasMountPath string `json:"nas_mount_path"` + NasShareAddr string `json:"nas_share_addr"` + DatasetName string +} + +type GetTrainJobLogResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool `json:"is_success"` + Content string `json:"content"` + Lines int `json:"lines"` + StartLine string `json:"start_line"` + EndLine string `json:"end_line"` +} + +type GetTrainJobLogFileNamesResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool `json:"is_success"` + LogFileList []string `json:"log_file_list"` +} + +type TrainJobResult struct { + ErrorCode string `json:"error_code"` + ErrorMsg string `json:"error_msg"` + IsSuccess bool 
`json:"is_success"` +} + +type LogFile struct { + Name string +} + func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { sess := x.NewSession() defer sess.Close() @@ -672,6 +933,12 @@ func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err erro return } +func SetTrainJobStatusByJobID(jobID string, status string, duration int, trainjobduration string) (err error) { + cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration} + _, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb) + return +} + func UpdateJob(job *Cloudbrain) error { return updateJob(x, job) } @@ -683,6 +950,17 @@ func updateJob(e Engine, job *Cloudbrain) error { return err } +// func UpdateTrainJob(job *CloudbrainInfo) error { +// return updateTrainJob(x, job) +// } + +// func updateTrainJob(e Engine, job *CloudbrainInfo) error { +// var sess *xorm.Session +// sess = e.Where("job_id = ?", job.Cloudbrain.JobID) +// _, err := sess.Cols("status", "container_id", "container_ip").Update(job) +// return err +// } + func DeleteJob(job *Cloudbrain) error { return deleteJob(x, job) } @@ -698,7 +976,7 @@ func GetCloudbrainByName(jobName string) (*Cloudbrain, error) { } func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool { - if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)){ + if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)) { return false } repo, err := GetRepositoryByID(job.RepoID) diff --git a/models/file_chunk.go b/models/file_chunk.go index b849f0108..8decb7b44 100755 --- a/models/file_chunk.go +++ b/models/file_chunk.go @@ -14,7 +14,10 @@ const ( ) const ( - TypeCloudBrainOne = 0 + 
TypeCloudBrainOne = 0 + TypeCloudBrainNotebook = 1 + TypeCloudBrainTrainJob = 2 + TypeCloudBrainTwo = 1 ) diff --git a/models/repo.go b/models/repo.go index dd36bf71d..c8629875e 100755 --- a/models/repo.go +++ b/models/repo.go @@ -6,13 +6,14 @@ package models import ( - "code.gitea.io/gitea/modules/blockchain" "context" "crypto/md5" "errors" "fmt" "html/template" + "code.gitea.io/gitea/modules/blockchain" + // Needed for jpeg support _ "image/jpeg" "image/png" @@ -171,11 +172,11 @@ type Repository struct { NumOpenIssues int `xorm:"-"` NumPulls int NumClosedPulls int - NumOpenPulls int `xorm:"-"` - NumMilestones int `xorm:"NOT NULL DEFAULT 0"` - NumClosedMilestones int `xorm:"NOT NULL DEFAULT 0"` - NumOpenMilestones int `xorm:"-"` - NumCommit int64 `xorm:"NOT NULL DEFAULT 0"` + NumOpenPulls int `xorm:"-"` + NumMilestones int `xorm:"NOT NULL DEFAULT 0"` + NumClosedMilestones int `xorm:"NOT NULL DEFAULT 0"` + NumOpenMilestones int `xorm:"-"` + NumCommit int64 `xorm:"NOT NULL DEFAULT 0"` IsPrivate bool `xorm:"INDEX"` IsEmpty bool `xorm:"INDEX"` @@ -215,8 +216,8 @@ type Repository struct { CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"` UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` - Hot int64 `xorm:"-"` - Active int64 `xorm:"-"` + Hot int64 `xorm:"-"` + Active int64 `xorm:"-"` } // SanitizedOriginalURL returns a sanitized OriginalURL @@ -2473,7 +2474,7 @@ func (repo *Repository) IncreaseCloneCnt() { } func UpdateRepositoryCommitNum(repo *Repository) error { - if _,err := x.Exec("UPDATE `repository` SET num_commit = ? where id = ?", repo.NumCommit, repo.ID); err != nil { + if _, err := x.Exec("UPDATE `repository` SET num_commit = ? 
where id = ?", repo.NumCommit, repo.ID); err != nil { return err } diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index 0be3e3882..f2e5aeed5 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -14,3 +14,32 @@ type CreateModelArtsForm struct { func (f *CreateModelArtsForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { return validate(errs, ctx.Data, f, ctx.Locale) } + +type CreateModelArtsNotebookForm struct { + JobName string `form:"job_name" binding:"Required"` + Attachment string `form:"attachment"` + Description string `form:"description"` +} + +func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { + return validate(errs, ctx.Data, f, ctx.Locale) +} + +type CreateModelArtsTrainJobForm struct { + JobName string `form:"job_name" binding:"Required"` + Attachment string `form:"attachment" binding:"Required"` + BootFile string `form:"boot_file" binding:"Required"` + WorkServerNumber int `form:"work_server_number" binding:"Required"` + EngineID int `form:"engine_id" binding:"Required"` + PoolID string `form:"pool_id" binding:"Required"` + Flavor string `form:"flavor" binding:"Required"` + Params string `form:"run_para_list" binding:"Required"` + Description string `form:"description"` + IsSaveParam string `form:"is_save_para"` + ParameterTemplateName string `form:"parameter_template_name"` + PrameterDescription string `form:"parameter_description"` +} + +func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { + return validate(errs, ctx.Data, f, ctx.Locale) +} diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 483292d2f..8f6bf4e17 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -131,7 +131,8 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath, JobName: jobName, SubTaskName: SubTaskName, JobType: 
jobType, - Type: models.TypeCloudBrainOne, + Type: models.TypeCloudBrainOne, + Uuid: uuid, }) if err != nil { diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index edd9d5d6b..63baa910f 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -1,22 +1,53 @@ package modelarts import ( + "encoding/json" + "path" + "strconv" + "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/storage" - "encoding/json" - "path" ) const ( + //notebook storageTypeOBS = "obs" autoStopDuration = 4 * 60 * 60 DataSetMountPath = "/home/ma-user/work" NotebookEnv = "Python3" NotebookType = "Ascend" + FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" + + //train-job + // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" + // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" + // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + + // "]}" + // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + + // "]}" + CodePath = "/code/" + OutputPath = "/output/" + LogPath = "/log/" + JobPath = "/job/" + OrderDesc = "desc" //向下查询 + 
OrderAsc = "asc" //向上查询 + Lines = 20 + TrainUrl = "train_url" + DataUrl = "data_url" + PerPage = 10 + + SortByCreateTime = "create_time" + ConfigTypeCustom = "custom" ) var ( @@ -24,6 +55,50 @@ var ( FlavorInfos *models.FlavorInfos ) +type GenerateTrainJobReq struct { + JobName string + Uuid string + Description string + CodeObsPath string + BootFile string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter +} + +type VersionInfo struct { + Version []struct { + ID int `json:"id"` + Value string `json:"value"` + } `json:"version"` +} + +type Flavor struct { + Info []struct { + Code string `json:"code"` + Value string `json:"value"` + } `json:"flavor"` +} + +type Engine struct { + Info []struct { + ID int `json:"id"` + Value string `json:"value"` + } `json:"engine"` +} + +type ResourcePool struct { + Info []struct { + ID string `json:"id"` + Value string `json:"value"` + } `json:"resource_pool"` +} + func GenerateTask(ctx *context.Context, jobName, uuid, description string) error { var dataActualPath string if uuid != "" { @@ -76,20 +151,121 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error log.Error("CreateJob failed: %v", err.Error()) return err } - err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: string(models.JobWaiting), UserID: ctx.User.ID, RepoID: ctx.Repo.Repository.ID, JobID: jobResult.ID, JobName: jobName, JobType: string(models.JobTypeDebug), - Type: models.TypeCloudBrainTwo, + Type: models.TypeCloudBrainNotebook, + Uuid: uuid, + }) + + if err != nil { + return err + } + + return nil +} + +func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { + jobResult, err := createTrainJob(models.CreateTrainJobParams{ + JobName: req.JobName, + Description: req.Description, + Config: models.Config{ + WorkServerNum: req.WorkServerNumber, + AppUrl: req.CodeObsPath, + BootFileUrl: req.BootFile, + 
DataUrl: req.DataUrl, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + LogUrl: req.LogUrl, + PoolID: req.PoolID, + CreateVersion: true, + Flavor: models.Flavor{ + Code: req.FlavorCode, + }, + Parameter: req.Parameters, + }, + }) + if err != nil { + log.Error("CreateJob failed: %v", err.Error()) + return err + } + + err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeDebug), + Type: models.TypeCloudBrainTrainJob, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, }) if err != nil { + log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) return err } return nil } + +func TransTrainJobStatus(status int) string { + switch status { + case 0: + return "UNKNOWN" + case 1: + return "CREATING" + case 2: + return "IMAGE_CREATING" + case 3: + return "IMAGE_FAILED" + case 4: + return "SUBMIT_TRYING" + case 5: + return "SUBMIT_FAILED" + case 6: + return "DELETE_FAILED" + case 7: + return "WAITING" + case 8: + return "RUNNING" + case 9: + return "STOPPED" + case 10: + return "COMPLETED" + case 11: + return "FAILED" + case 12: + return "STOPPED" + case 13: + return "CANCELED" + case 14: + return "LOST" + case 15: + return "SCALING" + case 16: + return "SUBMIT_MODEL_FAILED" + case 17: + return "DEPLOY_SERVICE_FAILED" + case 18: + return "CHECK_INIT" + case 19: + return "CHECK_RUNNING" + case 20: + return "CHECK_RUNNING_COMPLETED" + case 21: + return "CHECK_FAILED" + + default: + return strconv.Itoa(status) + } + + return "" +} diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go index f91be5e31..d17478c94 100755 --- a/modules/modelarts/resty.go +++ b/modules/modelarts/resty.go @@ -1,13 +1,14 @@ package modelarts import ( - "code.gitea.io/gitea/modules/log" "crypto/tls" "encoding/json" "fmt" "net/http" 
+ "strconv" "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "github.com/go-resty/resty/v2" ) @@ -23,6 +24,9 @@ const ( urlGetToken = "/v3/auth/tokens" urlNotebook = "/demanager/instances" + urlTrainJob = "/training-jobs" + urlResourceSpecs = "/job/resource-specs" + urlTrainJobConfig = "/training-job-configs" errorCodeExceedLimit = "ModelArts.0118" ) @@ -104,7 +108,7 @@ sendjob: Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) if err != nil { - return nil, fmt.Errorf("resty create job: %s", err) + return nil, fmt.Errorf("resty create notebook: %s", err) } if res.StatusCode() == http.StatusUnauthorized && retry < 1 { @@ -121,11 +125,11 @@ sendjob: } if len(response.ErrorCode) != 0 { - log.Error("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) if response.ErrorCode == errorCodeExceedLimit { response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" } - return &result, fmt.Errorf("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) } return &result, nil @@ -210,6 +214,45 @@ sendjob: return &result, nil } +func DelNotebook(jobID string) (*models.NotebookDelResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookDelResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + func DelJob(jobID string) (*models.NotebookDelResult, error) { checkSetting() client := getRestyClient() @@ -287,3 +330,441 @@ sendjob: return &result, nil } + +func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). 
+ Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) + + if err != nil { + return nil, fmt.Errorf("resty create train-job: %s", err) + } + + req, _ := json.Marshal(createJobParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetResourceSpecsResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) + + if err != nil { + return nil, fmt.Errorf("resty GetResourceSpecs: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobConfigResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(req). + SetResult(&result). 
+ Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) + + if err != nil { + return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + //temp, _ := json.Marshal(req) + //log.Info("%s", temp) + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetConfigListResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "per_page": strconv.Itoa(perPage), + "page": strconv.Itoa(page), + "sortBy": sortBy, + "order": order, + "search_content": searchContent, + "config_type": configType, + }). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) + + if err != nil { + return nil, fmt.Errorf("resty GetConfigList: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetParaConfig(configName, configType string) (models.GetConfigResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetConfigResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "config_type": configType, + }). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName) + + if err != nil { + return result, fmt.Errorf("resty GetParaConfig: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return result, nil +} + +func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID) + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取作业详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("获取作业详情失败") + } + + return &result, nil +} + +func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobLogResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "base_line": baseLine, + "lines": strconv.Itoa(lines), + "log_file": logFile, + "order": order, + }). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log") + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJobLog: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取作业日志失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJobLog(%s) failed", jobID) + return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobLogFileNamesResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names") + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJobLogFileNames(%s) failed", jobID) + return &result, fmt.Errorf("获取作业日志文件失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func DelTrainJob(jobID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("删除训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("DelTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("删除训练作业失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func StopTrainJob(jobID, versionID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/stop") + + if err != nil { + return &result, fmt.Errorf("resty StopTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("StopTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("停止训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("StopTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("停止训练作业失败:%s", result.ErrorMsg) + } + + return &result, nil +} diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 221933c6f..eb0bab836 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -467,13 +467,16 @@ var ( CommitValidDate string //obs config - Endpoint string - AccessKeyID string - SecretAccessKey string - Bucket string - Location string - BasePath string - UserBasePath string + Endpoint string + AccessKeyID string + SecretAccessKey string + Bucket string + Location string + BasePath string + OutPutPath string + TrainJobModelPath string + CodePathPrefix string + UserBasePath string //modelarts config ModelArtsHost string @@ -483,10 +486,16 @@ var ( ModelArtsUsername string ModelArtsPassword string ModelArtsDomain string + AllowedOrg string ProfileID string PoolInfos string Flavor string - FlavorInfos string + //train-job + ResourcePools string + Engines string + EngineVersions string + FlavorInfos string + TrainJobFLAVORINFOS string //elk config ElkUrl string @@ -1246,6 +1255,9 @@ func NewContext() { Bucket = 
sec.Key("BUCKET").MustString("testopendata") Location = sec.Key("LOCATION").MustString("cn-south-222") BasePath = sec.Key("BASE_PATH").MustString("attachment/") + TrainJobModelPath = sec.Key("TrainJobModel_Path").MustString("job/") + OutPutPath = sec.Key("Output_Path").MustString("output/") + CodePathPrefix = sec.Key("CODE_PATH_PREFIX").MustString("code/") UserBasePath = sec.Key("BASE_PATH_USER").MustString("users/") PROXYURL = sec.Key("PROXY_URL").MustString("") @@ -1257,10 +1269,15 @@ func NewContext() { ModelArtsUsername = sec.Key("USERNAME").MustString("") ModelArtsPassword = sec.Key("PASSWORD").MustString("") ModelArtsDomain = sec.Key("DOMAIN").MustString("cn-south-222") + AllowedOrg = sec.Key("ORGANIZATION").MustString("") ProfileID = sec.Key("PROFILE_ID").MustString("") PoolInfos = sec.Key("POOL_INFOS").MustString("") - Flavor = sec.Key("FLAVOR").MustString("") + Flavor = sec.Key("FLAVOR").MustString("modelarts.bm.910.arm.public.2") + ResourcePools = sec.Key("Resource_Pools").MustString("") + Engines = sec.Key("Engines").MustString("") + EngineVersions = sec.Key("Engine_Versions").MustString("") FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("") + TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("") sec = Cfg.Section("elk") ElkUrl = sec.Key("ELKURL").MustString("http://192.168.207.35:5601/internal/bsearch") diff --git a/modules/storage/obs.go b/modules/storage/obs.go index d5817fd3f..e5e02952f 100755 --- a/modules/storage/obs.go +++ b/modules/storage/obs.go @@ -134,6 +134,7 @@ func ObsDownload(uuid string, fileName string) (io.ReadCloser, error) { input := &obs.GetObjectInput{} input.Bucket = setting.Bucket input.Key = strings.TrimPrefix(path.Join(setting.BasePath, path.Join(uuid[0:1], uuid[1:2], uuid, fileName)), "/") + // input.Key = strings.TrimPrefix(path.Join(setting.BasePath, path.Join(uuid[0:1], uuid[1:2], uuid)), "/") output, err := ObsCli.GetObject(input) if err == nil { log.Info("StorageClass:%s, ETag:%s, ContentType:%s, 
ContentLength:%d, LastModified:%s\n", @@ -148,6 +149,56 @@ func ObsDownload(uuid string, fileName string) (io.ReadCloser, error) { } } +func ObsModelDownload(JobName string, fileName string) (io.ReadCloser, error) { + input := &obs.GetObjectInput{} + input.Bucket = setting.Bucket + input.Key = strings.TrimPrefix(path.Join(setting.TrainJobModelPath, JobName, setting.OutPutPath, fileName), "/") + // input.Key = strings.TrimPrefix(path.Join(setting.BasePath, path.Join(uuid[0:1], uuid[1:2], uuid)), "/") + output, err := ObsCli.GetObject(input) + if err == nil { + log.Info("StorageClass:%s, ETag:%s, ContentType:%s, ContentLength:%d, LastModified:%s\n", + output.StorageClass, output.ETag, output.ContentType, output.ContentLength, output.LastModified) + return output.Body, nil + } else if obsError, ok := err.(obs.ObsError); ok { + fmt.Printf("Code:%s\n", obsError.Code) + fmt.Printf("Message:%s\n", obsError.Message) + return nil, obsError + } else { + return nil, err + } +} + +func GetObsListObject(jobName string) ([]string, error) { + // jobName = "liuzx202110271830856" + input := &obs.ListObjectsInput{} + input.Bucket = setting.Bucket + input.Prefix = strings.TrimPrefix(path.Join(setting.TrainJobModelPath, jobName, setting.OutPutPath), "/") + log.Info("input.Prefix:", input.Prefix) + output, err := ObsCli.ListObjects(input) + log.Info("output.Prefix:", output) + ModelListArr := make([]string, 0) + if err == nil { + fmt.Printf("RequestId:%s\n", output.RequestId) + for index, val := range output.Contents { + fmt.Printf("Content[%d]-OwnerId:%s, ETag:%s, Key:%s, LastModified:%s, Size:%d\n", + index, val.Owner.ID, val.ETag, val.Key, val.LastModified, val.Size) + str1 := strings.Split(val.Key, "/") + ModelList := str1[len(str1)-1] + ModelListArr = append(ModelListArr, ModelList) + log.Info("ModelListArr.Prefix:", ModelListArr) + } + return ModelListArr, err + } else { + if obsError, ok := err.(obs.ObsError); ok { + fmt.Println(obsError.Code) + fmt.Println(obsError.Message) + } 
else { + fmt.Println(err) + } + return nil, err + } +} + func ObsGenMultiPartSignedUrl(uuid string, uploadId string, partNumber int, fileName string) (string, error) { input := &obs.CreateSignedUrlInput{} @@ -171,6 +222,29 @@ func ObsGenMultiPartSignedUrl(uuid string, uploadId string, partNumber int, file return output.SignedUrl, nil } +func GetObsCreateSignedUrl(uuid string, uploadId string, partNumber int, fileName string) (string, error) { + + input := &obs.CreateSignedUrlInput{} + input.Bucket = setting.Bucket + input.Key = strings.TrimPrefix(path.Join(setting.BasePath, path.Join(uuid[0:1], uuid[1:2], uuid, fileName)), "/") + input.Expires = 60 * 60 + input.Method = obs.HttpMethodPut + + input.QueryParams = map[string]string{ + "partNumber": com.ToStr(partNumber, 10), + "uploadId": uploadId, + //"partSize": com.ToStr(partSize,10), + } + + output, err := ObsCli.CreateSignedUrl(input) + if err != nil { + log.Error("CreateSignedUrl failed:", err.Error()) + return "", err + } + + return output.SignedUrl, nil +} + func ObsGetPreSignedUrl(uuid, fileName string) (string, error) { input := &obs.CreateSignedUrlInput{} input.Method = obs.HttpMethodGet diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 746e46463..18ca21925 100644 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -774,6 +774,53 @@ cloudbrain_creator=Creator cloudbrain_task = Task Name cloudbrain_operate = Operate cloudbrain_status_createtime = Status/Createtime +cloudbrain_status_runtime = Running Time + + +modelarts.notebook=Debug Job +modelarts.train_job=Create Job +modelarts.train_job.new=New Job +modelarts.train_job.basic_info=Basic Info +modelarts.train_job.job_status=Job Status +modelarts.train_job.job_name=Job Name +modelarts.train_job.version=Job Version +modelarts.train_job.start_time=Start Time +modelarts.train_job.dura_time=Duration Time +modelarts.train_job.description=Description +modelarts.train_job.parameter_setting=Parameter 
setting +modelarts.train_job.parameter_setting_info=Parameter Info +modelarts.train_job.fast_parameter_setting=fast_parameter_setting +modelarts.train_job.fast_parameter_setting_config=fast_parameter_setting_config +modelarts.train_job.fast_parameter_setting_config_link=fast_parameter_setting_config_link +modelarts.train_job.frames=frames +modelarts.train_job.algorithm_origin=Algorithm Origin +modelarts.train_job.AI_driver=AI Engine +modelarts.train_job.start_file=Start File +modelarts.train_job.boot_file_helper=The start file is the entry file your program executes and must end with .py. +modelarts.train_job.dataset=Dataset +modelarts.train_job.run_parameter=Run Parameter +modelarts.train_job.add_run_parameter=Add Run Parameter +modelarts.train_job.parameter_name=Parameter Name +modelarts.train_job.parameter_value=Parameter Value +modelarts.train_job.resource_setting=resource_setting +modelarts.train_job.resource_setting_info=resource_setting_info +modelarts.train_job.resource_pool=resource_pool +modelarts.train_job.resource_type=resource_type +modelarts.train_job.standard=Standard +modelarts.train_job.NAS_address=NAS Address +modelarts.train_job.NAS_mount_path=NAS Mount Path +modelarts.train_job.query_whether_save_parameter=query_whether_save_parameter +modelarts.train_job.save_helper=save_helper +modelarts.train_job.common_frame=common_frame +modelarts.train_job.amount_of_compute_node=Amount of Compute Node +modelarts.train_job.job_parameter_name=job_parameter_name +modelarts.train_job.parameter_description=parameter_description +modelarts.log=Log +modelarts.version_manage=Version Manage +modelarts.back=Back +modelarts.train_job_para_admin=train_job_para_admin +modelarts.train_job_para.edit=train_job_para.edit +modelarts.train_job_para.connfirm=train_job_para.connfirm template.items = Template Items template.git_content = Git Content (Default Branch) diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 32c27afe1..8ac895ec9 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ 
-776,6 +776,7 @@ cloudbrain_creator=创建者 cloudbrain_task=任务名称 cloudbrain_operate=操作 cloudbrain_status_createtime=状态/创建时间 +cloudbrain_status_runtime = 运行时长 cloudbrain_jobname_err=只能以小写字母或数字开头且只包含小写字母、数字、_和-,不能以_结尾,最长36个字符。 modelarts.notebook=调试作业 diff --git a/public/self/test.js b/public/self/test.js new file mode 100644 index 000000000..2839c76ab --- /dev/null +++ b/public/self/test.js @@ -0,0 +1,28 @@ + +function displayDir(uuid){ + console.log('uuid 1=' + uuid); + + var html="\ + \ + \ + 数据集名称\ + 数据集类型\ + 数据集描述\ + 数据集创建者\ + "; + + for (var i=0;i<1;i++){ + var row = "\ + \ + "+uuid+"\ + " + uuid +"\ + 测试\ + 测试\ + 测试\ + "; + html=html+row; + } + + document.getElementById('dataset-files-table').innerHTML=html; + console.log('uuid 2=' + uuid); +} diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index c6f3ee0ac..9dd773c4d 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -853,7 +853,15 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/:jobid", repo.GetCloudbrainTask) }, reqRepoReader(models.UnitTypeCloudBrain)) m.Group("/modelarts", func() { - m.Get("/:jobid", repo.GetModelArtsTask) + m.Group("/notebook", func() { + m.Get("/:jobid", repo.GetModelArtsNotebook) + }) + m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Get("", repo.GetModelArtsTrainJob) + m.Get("/log", repo.TrainJobGetLog) + }) + }) }, reqRepoReader(models.UnitTypeCloudBrain)) }, repoAssignment()) }) diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 52b83b446..1ec4fa919 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -11,9 +11,10 @@ import ( "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/modelarts" "net/http" + "strconv" ) -func GetModelArtsTask(ctx *context.APIContext) { +func GetModelArtsNotebook(ctx *context.APIContext) { var ( err error ) @@ -43,3 +44,81 @@ func GetModelArtsTask(ctx *context.APIContext) { }) } + +func GetModelArtsTrainJob(ctx *context.APIContext) { + 
var ( + err error + ) + + jobID := ctx.Params(":jobid") + repoID := ctx.Repo.Repository.ID + job, err := models.GetRepoCloudBrainByJobID(repoID, jobID) + if err != nil { + ctx.NotFound(err) + return + } + result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) + if err != nil { + ctx.NotFound(err) + return + } + + job.Status = modelarts.TransTrainJobStatus(result.IntStatus) + err = models.UpdateJob(job) + if err != nil { + log.Error("UpdateJob failed:", err) + } + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "JobStatus": job.Status, + }) + +} + +func TrainJobGetLog(ctx *context.APIContext) { + var ( + err error + ) + + log.Info("test") + + var jobID = ctx.Params(":jobid") + var logFileName = ctx.Query("file_name") + var baseLine = ctx.Query("base_line") + var order = ctx.Query("order") + + if order != modelarts.OrderDesc && order != modelarts.OrderAsc { + log.Error("order(%s) check failed", order) + ctx.JSON(http.StatusBadRequest, map[string]interface{}{ + "err_msg": "order check failed", + }) + return + } + + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) + ctx.JSON(http.StatusInternalServerError, map[string]interface{}{ + "err_msg": "GetCloudbrainByJobID failed", + }) + return + } + + result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines) + if err != nil { + log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) + ctx.JSON(http.StatusInternalServerError, map[string]interface{}{ + "err_msg": "GetTrainJobLog failed", + }) + return + } + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "StartLine": result.StartLine, + "EndLine": result.EndLine, + "Content": result.Content, + "Lines": result.Lines, + }) +} diff --git a/routers/repo/attachment.go b/routers/repo/attachment.go index bb48c640b..b6661433e 100755 --- 
a/routers/repo/attachment.go +++ b/routers/repo/attachment.go @@ -1014,7 +1014,7 @@ func queryDatasets(ctx *context.Context, attachs []*models.AttachmentUsername) { } func checkTypeCloudBrain(typeCloudBrain int) error { - if typeCloudBrain != models.TypeCloudBrainOne && typeCloudBrain != models.TypeCloudBrainTwo { + if typeCloudBrain != models.TypeCloudBrainOne && typeCloudBrain != models.TypeCloudBrainNotebook { log.Error("type error:", typeCloudBrain) return errors.New("type error") } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index f0bb3a36f..782a51000 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -100,15 +100,15 @@ func cutString(str string, lens int) string { func jobNamePrefixValid(s string) string { lowStr := strings.ToLower(s) re := regexp.MustCompile(`[^a-z0-9_\\-]+`) - + removeSpecial := re.ReplaceAllString(lowStr, "") - + re = regexp.MustCompile(`^[_\\-]+`) return re.ReplaceAllString(removeSpecial, "") } -func cloudBrainNewDataPrepare(ctx *context.Context) error{ +func cloudBrainNewDataPrepare(ctx *context.Context) error { ctx.Data["PageIsCloudBrain"] = true t := time.Now() var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] @@ -202,7 +202,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { gpuQueue := setting.JobType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath resourceSpecId := form.ResourceSpecId - + if !jobNamePattern.MatchString(jobName) { ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplModelArtsNew, &form) return @@ -474,7 +474,7 @@ func CloudBrainDel(ctx *context.Context) { return } - if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed){ + if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) { log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"]) ctx.ServerError("the job 
has not been stopped", errors.New("the job has not been stopped")) return @@ -580,10 +580,77 @@ func CloudBrainDownloadModel(ctx *context.Context) { ctx.ServerError("PresignedGetURL", err) return } - http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently) } +// func TrainJobloadModel(ctx *context.Context) { +// parentDir := ctx.Query("parentDir") +// fileName := ctx.Query("fileName") +// jobName := ctx.Query("jobName") +// filePath := "jobs/" + jobName + "/model/" + parentDir +// url, err := storage.Attachments.PresignedGetURL(filePath, fileName) +// if err != nil { +// log.Error("PresignedGetURL failed: %v", err.Error(), ctx.Data["msgID"]) +// ctx.ServerError("PresignedGetURL", err) +// return +// } + +// http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently) +// } + +func TrainJobListModel(ctx *context.Context) { + ctx.Data["PageIsCloudBrain"] = true + jobID := ctx.Params(":jobid") + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("no such job!", ctx.Data["err"]) + ctx.ServerError("no such job:", err) + return + } + TrainJobListModel, err := storage.GetObsListObject(task.JobName) + log.Info("TrainJobListModel", TrainJobListModel) + fmt.Println("TrainJobListModel:", TrainJobListModel) + if err != nil { + log.Info("get TrainJobListModel failed:", err) + return + } + ctx.Data["task"] = task + ctx.Data["JobID"] = jobID + ctx.Data["ListModel"] = TrainJobListModel + ctx.HTML(200, tplModelArtsTrainJobListModel) +} + +func TrainJobDownloadModel(ctx *context.Context) { + JobName := ctx.Query("JobName") + fileName := ctx.Query("file_name") + + // JobName = "liuzx202110271830856" + // fileName = "Untitled.ipynb" + + body, err := storage.ObsModelDownload(JobName, fileName) + if err != nil { + log.Info("download error.") + } else { + defer body.Close() + ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName) + ctx.Resp.Header().Set("Content-Type", "application/octet-stream") + p := 
make([]byte, 1024) + var readErr error + var readCount int + // 读取对象内容 + for { + readCount, readErr = body.Read(p) + if readCount > 0 { + ctx.Resp.Write(p[:readCount]) + //fmt.Printf("%s", p[:readCount]) + } + if readErr != nil { + break + } + } + } +} + func GetRate(ctx *context.Context) { var jobID = ctx.Params(":jobid") job, err := models.GetCloudbrainByJobID(jobID) @@ -609,9 +676,9 @@ func downloadCode(repo *models.Repository, codePath string) error { return err } - configFile, err := os.OpenFile(codePath + "/.git/config", os.O_RDWR, 0666) + configFile, err := os.OpenFile(codePath+"/.git/config", os.O_RDWR, 0666) if err != nil { - log.Error("open file(%s) failed:%v", codePath + "/,git/config", err) + log.Error("open file(%s) failed:%v", codePath+"/,git/config", err) return err } @@ -631,10 +698,10 @@ func downloadCode(repo *models.Repository, codePath string) error { } } - if strings.Contains(line, "url") && strings.Contains(line, ".git"){ + if strings.Contains(line, "url") && strings.Contains(line, ".git") { originUrl := "\turl = " + repo.CloneLink().HTTPS + "\n" if len(line) > len(originUrl) { - originUrl += strings.Repeat( " ", len(line) - len(originUrl)) + originUrl += strings.Repeat(" ", len(line)-len(originUrl)) } bytes := []byte(originUrl) _, err := configFile.WriteAt(bytes, pos) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 6ae97e3e2..3203bc872 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -1,14 +1,23 @@ package repo import ( - "code.gitea.io/gitea/modules/modelarts" "encoding/json" "errors" - "github.com/unknwon/com" + "fmt" + "io" + "net/http" + "os" + "path" "strconv" "strings" "time" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/modelarts" + "code.gitea.io/gitea/modules/obs" + "code.gitea.io/gitea/modules/storage" + "github.com/unknwon/com" + "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/auth" "code.gitea.io/gitea/modules/base" @@ -18,9 +27,19 @@ import ( ) const 
( + // tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index" + tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index" + tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new" + tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show" + tplModelArtsIndex base.TplName = "repo/modelarts/index" tplModelArtsNew base.TplName = "repo/modelarts/new" tplModelArtsShow base.TplName = "repo/modelarts/show" + + tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index" + tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new" + tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show" + tplModelArtsTrainJobListModel base.TplName = "repo/modelarts/trainjob/list_model" ) // MustEnableDataset check if repository enable internal cb @@ -30,6 +49,7 @@ func MustEnableModelArts(ctx *context.Context) { return } } + func ModelArtsIndex(ctx *context.Context) { MustEnableModelArts(ctx) repo := ctx.Repo.Repository @@ -231,7 +251,7 @@ func ModelArtsDel(ctx *context.Context) { return } - if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped){ + if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) { log.Error("the job(%s) has not been stopped", task.JobName) ctx.ServerError("the job has not been stopped", errors.New("the job has not been stopped")) return @@ -252,3 +272,826 @@ func ModelArtsDel(ctx *context.Context) { ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts") } + +func NotebookIndex(ctx *context.Context) { + MustEnableModelArts(ctx) + repo := ctx.Repo.Repository + page := ctx.QueryInt("page") + if page <= 0 { + page = 1 + } + + ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{ + ListOptions: models.ListOptions{ + Page: page, + PageSize: 
setting.UI.IssuePagingNum, + }, + RepoID: repo.ID, + Type: models.TypeCloudBrainNotebook, + }) + if err != nil { + ctx.ServerError("Cloudbrain", err) + return + } + + for i, task := range ciTasks { + if task.Status == string(models.JobRunning) { + ciTasks[i].CanDebug = true + } else { + ciTasks[i].CanDebug = false + } + } + + pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) + pager.SetDefaultParams(ctx) + ctx.Data["Page"] = pager + + ctx.Data["PageIsCloudBrain"] = true + ctx.Data["Tasks"] = ciTasks + ctx.HTML(200, tplModelArtsNotebookIndex) +} + +func NotebookNew(ctx *context.Context) { + ctx.Data["PageIsCloudBrain"] = true + + t := time.Now() + var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["job_name"] = jobName + + attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID) + if err != nil { + ctx.ServerError("GetAllUserAttachments failed:", err) + return + } + + ctx.Data["attachments"] = attachs + ctx.Data["dataset_path"] = modelarts.DataSetMountPath + ctx.Data["env"] = modelarts.NotebookEnv + ctx.Data["notebook_type"] = modelarts.NotebookType + if modelarts.FlavorInfos == nil { + json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos) + } + ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo + + ctx.HTML(200, tplModelArtsNotebookNew) +} + +func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) { + ctx.Data["PageIsNotebook"] = true + jobName := form.JobName + uuid := form.Attachment + description := form.Description + + err := modelarts.GenerateTask(ctx, jobName, uuid, description) + if err != nil { + ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form) + return + } + + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook") +} + +func NotebookShow(ctx *context.Context) { + ctx.Data["PageIsCloudBrain"] = true + + var jobID = ctx.Params(":jobid") + task, err := 
models.GetCloudbrainByJobID(jobID) + if err != nil { + ctx.Data["error"] = err.Error() + ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) + return + } + + result, err := modelarts.GetJob(jobID) + if err != nil { + ctx.Data["error"] = err.Error() + ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) + return + } + + if result != nil { + task.Status = result.Status + err = models.UpdateJob(task) + if err != nil { + ctx.Data["error"] = err.Error() + ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) + return + } + + createTime, _ := com.StrTo(result.CreationTimestamp).Int64() + result.CreateTime = time.Unix(int64(createTime/1000), 0).Format("2006-01-02 15:04:05") + endTime, _ := com.StrTo(result.LatestUpdateTimestamp).Int64() + result.LatestUpdateTime = time.Unix(int64(endTime/1000), 0).Format("2006-01-02 15:04:05") + result.QueuingInfo.BeginTime = time.Unix(int64(result.QueuingInfo.BeginTimestamp/1000), 0).Format("2006-01-02 15:04:05") + result.QueuingInfo.EndTime = time.Unix(int64(result.QueuingInfo.EndTimestamp/1000), 0).Format("2006-01-02 15:04:05") + } + + ctx.Data["task"] = task + ctx.Data["jobID"] = jobID + ctx.Data["result"] = result + ctx.HTML(200, tplModelArtsNotebookShow) +} + +func NotebookDebug(ctx *context.Context) { + var jobID = ctx.Params(":jobid") + _, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + ctx.ServerError("GetCloudbrainByJobID failed", err) + return + } + + result, err := modelarts.GetJob(jobID) + if err != nil { + ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) + return + } + + res, err := modelarts.GetJobToken(jobID) + if err != nil { + ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) + return + } + + urls := strings.Split(result.Spec.Annotations.Url, "/") + urlPrefix := result.Spec.Annotations.TargetDomain + for i, url := range urls { + if i > 2 { + urlPrefix += "/" + url + } + } + + debugUrl := urlPrefix + "?token=" + res.Token + ctx.Redirect(debugUrl) +} + 
+func NotebookStop(ctx *context.Context) { + var jobID = ctx.Params(":jobid") + log.Info(jobID) + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + ctx.ServerError("GetCloudbrainByJobID failed", err) + return + } + + if task.Status != string(models.JobRunning) { + log.Error("the job(%s) is not running", task.JobName) + ctx.ServerError("the job is not running", errors.New("the job is not running")) + return + } + + param := models.NotebookAction{ + Action: models.ActionStop, + } + res, err := modelarts.StopJob(jobID, param) + if err != nil { + log.Error("StopJob(%s) failed:%v", task.JobName, err.Error()) + ctx.ServerError("StopJob failed", err) + return + } + + task.Status = res.CurrentStatus + err = models.UpdateJob(task) + if err != nil { + ctx.ServerError("UpdateJob failed", err) + return + } + + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook") +} + +func NotebookDel(ctx *context.Context) { + var jobID = ctx.Params(":jobid") + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + ctx.ServerError("GetCloudbrainByJobID failed", err) + return + } + + if task.Status != string(models.JobStopped) { + log.Error("the job(%s) has not been stopped", task.JobName) + ctx.ServerError("the job has not been stopped", errors.New("the job has not been stopped")) + return + } + + _, err = modelarts.DelNotebook(jobID) + if err != nil { + log.Error("DelJob(%s) failed:%v", task.JobName, err.Error()) + ctx.ServerError("DelJob failed", err) + return + } + + err = models.DeleteJob(task) + if err != nil { + ctx.ServerError("DeleteJob failed", err) + return + } + + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook") +} + +func TrainJobIndex(ctx *context.Context) { + MustEnableModelArts(ctx) + + can, err := canUserCreateTrainJob(ctx.User.ID) + if err != nil { + ctx.ServerError("canUserCreateTrainJob", err) + return + } + + ctx.Data["CanCreate"] = can + + repo := ctx.Repo.Repository + page := 
ctx.QueryInt("page") + if page <= 0 { + page = 1 + } + + tasks, _, err := models.Cloudbrains(&models.CloudbrainsOptions{ + ListOptions: models.ListOptions{ + Page: page, + PageSize: setting.UI.IssuePagingNum, + }, + RepoID: repo.ID, + Type: models.TypeCloudBrainTrainJob, + }) + if err != nil { + ctx.ServerError("Cloudbrain", err) + return + } + + for i := range tasks { + TrainJobDetail, err := modelarts.GetTrainJob(tasks[i].Cloudbrain.JobID, strconv.FormatInt(tasks[i].Cloudbrain.VersionID, 10)) + if TrainJobDetail != nil { + TrainJobDetail.CreateTime = time.Unix(int64(TrainJobDetail.LongCreateTime/1000), 0).Format("2006-01-02 15:04:05") + if TrainJobDetail.Duration != 0 { + TrainJobDetail.TrainJobDuration = addZero(TrainJobDetail.Duration/3600000) + ":" + addZero(TrainJobDetail.Duration%3600000/60000) + ":" + addZero(TrainJobDetail.Duration%60000/1000) + + } else { + TrainJobDetail.TrainJobDuration = "00:00:00" + } + } + if err != nil { + log.Error("GetJob(%s) failed:%v", tasks[i].Cloudbrain.JobID, err.Error()) + return + } + err = models.SetTrainJobStatusByJobID(tasks[i].Cloudbrain.JobID, modelarts.TransTrainJobStatus(TrainJobDetail.IntStatus), int(TrainJobDetail.Duration), string(TrainJobDetail.TrainJobDuration)) + // err = models.UpdateJob(tasks[i].Cloudbrain) + if err != nil { + ctx.ServerError("UpdateJob failed", err) + return + } + } + + trainTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{ + ListOptions: models.ListOptions{ + Page: page, + PageSize: setting.UI.IssuePagingNum, + }, + RepoID: repo.ID, + Type: models.TypeCloudBrainTrainJob, + }) + if err != nil { + ctx.ServerError("Cloudbrain", err) + return + } + + pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) + pager.SetDefaultParams(ctx) + ctx.Data["Page"] = pager + + ctx.Data["PageIsCloudBrain"] = true + ctx.Data["Tasks"] = trainTasks + ctx.HTML(200, tplModelArtsTrainJobIndex) +} + +func TrainJobNew(ctx *context.Context) { + ctx.Data["PageIsCloudBrain"] = 
true + + can, err := canUserCreateTrainJob(ctx.User.ID) + if err != nil { + ctx.ServerError("canUserCreateTrainJob", err) + return + } + + if !can { + log.Error("the user can not create train-job") + ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job")) + return + } + + t := time.Now() + var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["job_name"] = jobName + + attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID) + if err != nil { + ctx.ServerError("GetAllUserAttachments failed:", err) + return + } + ctx.Data["attachments"] = attachs + + var resourcePools modelarts.ResourcePool + if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return + } + ctx.Data["resource_pools"] = resourcePools.Info + + var engines modelarts.Engine + if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return + } + ctx.Data["engines"] = engines.Info + + var versionInfos modelarts.VersionInfo + if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return + } + ctx.Data["engine_versions"] = versionInfos.Version + + var flavorInfos modelarts.Flavor + if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return + } + ctx.Data["flavor_infos"] = flavorInfos.Info + + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + ctx.Data["train_url"] = outputObsPath + + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) + if err != nil { + ctx.ServerError("getConfigList failed:", err) + return + } + + ctx.Data["config_list"] = configList.ParaConfigs + + 
ctx.HTML(200, tplModelArtsTrainJobNew) +} + +func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { + ctx.Data["PageIsTrainJob"] = true + jobName := form.JobName + uuid := form.Attachment + description := form.Description + workServerNumber := form.WorkServerNumber + engineID := form.EngineID + bootFile := form.BootFile + flavorCode := form.Flavor + params := form.Params + poolID := form.PoolID + isSaveParam := form.IsSaveParam + repo := ctx.Repo.Repository + codeLocalPath := setting.JobPath + jobName + modelarts.CodePath + codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" + + can, err := canUserCreateTrainJob(ctx.User.ID) + if err != nil { + ctx.ServerError("canUserCreateTrainJob", err) + return + } + + if !can { + log.Error("the user can not create train-job") + ctx.RenderWithErr("the user can not create train-job", tplModelArtsTrainJobNew, &form) + return + } + + //param check + if err := paramCheckCreateTrainJob(form); err != nil { + log.Error("paramCheckCreateTrainJob failed:(%v)", err) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + return + } + + if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { + log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) + ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) + return + } + + //todo: upload code (send to file_server todo this work?) 
+ if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { + log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) + ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) + return + } + + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil { + log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) + ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) + return + } + + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { + log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) + ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) + return + } + + //todo: del local code? + + var parameters models.Parameters + param := make([]models.Parameter, 0) + param = append(param, models.Parameter{ + Label: modelarts.TrainUrl, + Value: outputObsPath, + }, models.Parameter{ + Label: modelarts.DataUrl, + Value: dataPath, + }) + if len(params) != 0 { + err := json.Unmarshal([]byte(params), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", params, err) + ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) + return + } + + for _, parameter := range parameters.Parameter { + if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) + } + } + } + + //save param config + if isSaveParam == "on" { + if form.ParameterTemplateName == "" { + log.Error("ParameterTemplateName is empty") + ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) + return + } + + _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ + ConfigName: form.ParameterTemplateName, + Description: form.PrameterDescription, + DataUrl: dataPath, + AppUrl: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + TrainUrl: outputObsPath, + Flavor: 
models.Flavor{ + Code: flavorCode, + }, + WorkServerNum: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Parameter: param, + }) + + if err != nil { + log.Error("Failed to CreateTrainJobConfig: %v", err) + ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) + return + } + } + + req := &modelarts.GenerateTrainJobReq{ + JobName: jobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFile: codeObsPath + bootFile, + TrainUrl: outputObsPath, + FlavorCode: flavorCode, + WorkServerNumber: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Uuid: uuid, + Parameters: param, + } + + err = modelarts.GenerateTrainJob(ctx, req) + if err != nil { + log.Error("GenerateTrainJob failed:%v", err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + return + } + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") +} + +// readDir reads the directory named by dirname and returns +// a list of directory entries sorted by filename. 
+func readDir(dirname string) ([]os.FileInfo, error) { + f, err := os.Open(dirname) + if err != nil { + return nil, err + } + + list, err := f.Readdir(100) + f.Close() + if err != nil { + //todo: can not upload empty folder + if err == io.EOF { + return nil, nil + } + return nil, err + } + + //sort.Slice(list, func(i, j int) bool { return list[i].Name() < list[j].Name() }) + return list, nil +} + +func uploadCodeToObs(codePath, jobName, parentDir string) error { + files, err := readDir(codePath) + if err != nil { + log.Error("readDir(%s) failed: %s", codePath, err.Error()) + return err + } + + for _, file := range files { + if file.IsDir() { + input := &obs.PutObjectInput{} + input.Bucket = setting.Bucket + input.Key = parentDir + file.Name() + "/" + _, err = storage.ObsCli.PutObject(input) + if err != nil { + log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) + return err + } + + if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil { + log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error()) + return err + } + } else { + input := &obs.PutFileInput{} + input.Bucket = setting.Bucket + input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name() + input.SourceFile = codePath + file.Name() + _, err = storage.ObsCli.PutFile(input) + if err != nil { + log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error()) + return err + } + } + } + + return nil +} + +func obsMkdir(dir string) error { + input := &obs.PutObjectInput{} + input.Bucket = setting.Bucket + input.Key = dir + _, err := storage.ObsCli.PutObject(input) + if err != nil { + log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) + return err + } + + return nil +} + +func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error { + if !strings.HasSuffix(form.BootFile, ".py") { + log.Error("the boot file(%s) must be a python file", form.BootFile) + return errors.New("启动文件必须是python文件") + } + + if 
form.WorkServerNumber > 25 || form.WorkServerNumber < 1 { + log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber) + return errors.New("计算节点数必须在1-25之间") + } + + return nil +} + +func TrainJobShow(ctx *context.Context) { + ctx.Data["PageIsCloudBrain"] = true + + var jobID = ctx.Params(":jobid") + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + return + } + + attach, err := models.GetAttachmentByUUID(task.Uuid) + if err != nil { + log.Error("GetAttachmentByUUID(%s) failed:%v", jobID, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + return + } + + result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("GetJob(%s) failed:%v", jobID, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + return + } + + if result != nil { + result.CreateTime = time.Unix(int64(result.LongCreateTime/1000), 0).Format("2006-01-02 15:04:05") + if result.Duration != 0 { + result.TrainJobDuration = addZero(result.Duration/3600000) + ":" + addZero(result.Duration%3600000/60000) + ":" + addZero(result.Duration%60000/1000) + + } else { + result.TrainJobDuration = "00:00:00" + } + err = models.SetTrainJobStatusByJobID(jobID, modelarts.TransTrainJobStatus(result.IntStatus), int(result.Duration), string(result.TrainJobDuration)) + if err != nil { + ctx.ServerError("UpdateJob failed", err) + return + } + result.Status = modelarts.TransTrainJobStatus(result.IntStatus) + result.DatasetName = attach.Name + } + + resultLogFile, resultLog, err := trainJobGetLog(jobID) + if err != nil { + log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) + return + } + + ctx.Data["log_file_name"] = resultLogFile.LogFileList[0] + ctx.Data["log"] = resultLog + 
ctx.Data["task"] = task + ctx.Data["jobID"] = jobID + ctx.Data["result"] = result + ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) +} + +func addZero(t int64) (m string) { + if t < 10 { + m = "0" + strconv.FormatInt(t, 10) + return m + } else { + return strconv.FormatInt(t, 10) + } +} + +func TrainJobGetLog(ctx *context.Context) { + ctx.Data["PageIsTrainJob"] = true + + var jobID = ctx.Params(":jobid") + var logFileName = ctx.Query("file_name") + var baseLine = ctx.Query("base_line") + var order = ctx.Query("order") + + if order != modelarts.OrderDesc && order != modelarts.OrderAsc { + log.Error("order(%s) check failed", order) + ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow) + return + } + + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) + return + } + + result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines) + if err != nil { + log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) + return + } + + ctx.Data["log"] = result + //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) +} + +func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) + return nil, nil, err + } + + resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) + return nil, nil, err + } + + result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines) + if err != nil { + 
log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) + return nil, nil, err + } + + return resultLogFile, result, err +}
+
+// TrainJobDel handles the "del" action for the train job named by the
+// ":jobid" route parameter. It loads the local Cloudbrain record, asks
+// modelarts to delete the job, removes the local record and finally
+// redirects to the repository's train-job list. Lookup and remote-delete
+// failures re-render the train-job index with the error message; a failure
+// to delete the local record is reported through ctx.ServerError.
+func TrainJobDel(ctx *context.Context) {
+	var jobID = ctx.Params(":jobid")
+	task, err := models.GetCloudbrainByJobID(jobID)
+	if err != nil {
+		// task cannot be relied on when the lookup fails (it may be nil), so
+		// log the requested job ID rather than dereferencing task.JobName.
+		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
+		return
+	}
+
+	_, err = modelarts.DelTrainJob(jobID)
+	if err != nil {
+		log.Error("DelTrainJob(%s) failed:%v", task.JobName, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
+		return
+	}
+
+	err = models.DeleteJob(task)
+	if err != nil {
+		ctx.ServerError("DeleteJob failed", err)
+		return
+	}
+
+	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
+}
+
+// TrainJobStop handles the "stop" action for the train job named by the
+// ":jobid" route parameter. It loads the local Cloudbrain record to obtain
+// the job's VersionID, asks modelarts to stop that version and redirects to
+// the repository's train-job list. Any failure re-renders the train-job
+// index with the error message.
+func TrainJobStop(ctx *context.Context) {
+	var jobID = ctx.Params(":jobid")
+	task, err := models.GetCloudbrainByJobID(jobID)
+	if err != nil {
+		// task cannot be relied on when the lookup fails (it may be nil), so
+		// log the requested job ID rather than dereferencing task.JobName.
+		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
+		return
+	}
+
+	// The stop call takes the version ID as a decimal string.
+	_, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
+	if err != nil {
+		log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
+		return
+	}
+
+	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
+}
+
+// canUserCreateTrainJob reports whether the user with the given ID is a
+// member of the organization named by setting.AllowedOrg. It returns
+// (false, err) when that organization cannot be loaded; otherwise it returns
+// whatever org.IsOrgMember reports.
+func canUserCreateTrainJob(uid int64) (bool, error) {
+	org, err := models.GetOrgByName(setting.AllowedOrg)
+	if err != nil {
+		// log.Error is printf-style: without verbs the organization name was
+		// never rendered properly and the underlying error was lost.
+		log.Error("get allowed org(%s) failed: %v", setting.AllowedOrg, err)
+		return false, err
+	}
+
+	return org.IsOrgMember(uid)
+}
+
+// TrainJobGetConfigList validates the "order" query parameter, loads the
+// Cloudbrain record for the ":jobid" route parameter and stores the result of
+// modelarts.GetTrainJobLog in ctx.Data["log"].
+//
+// NOTE(review): despite its name this handler fetches the job log, not a
+// parameter-config list, and on success it renders nothing because the final
+// ctx.HTML call is commented out. It also reads ":jobid"; confirm that the
+// route it is registered on actually supplies that parameter.
+func TrainJobGetConfigList(ctx *context.Context) {
+	ctx.Data["PageIsTrainJob"] = true
+
+	var jobID = ctx.Params(":jobid")
+	var logFileName = ctx.Query("file_name")
+	var baseLine = ctx.Query("base_line")
+	var order = ctx.Query("order")
+
+	// Only the two orderings known to the modelarts package are accepted.
+	if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
+		log.Error("order(%s) check failed", order)
+		ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
+		return
+	}
+
+	task, err := models.GetCloudbrainByJobID(jobID)
+	if err != nil {
+		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
+		return
+	}
+
+	result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
+	if err != nil {
+		log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
+		return
+	}
+
+	ctx.Data["log"] = result
+	//ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
+}
+
+// getConfigList fetches one page of parameter configurations through
+// modelarts.GetConfigList and then, for every entry, loads its details with
+// modelarts.GetParaConfig and stores them in the entry's Result field.
+//
+// On any failure it returns a pointer to an empty GetConfigListResult
+// together with the error, so the returned pointer is never nil on the
+// error path.
+func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
+	var result models.GetConfigListResult
+
+	list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
+	if err != nil {
+		log.Error("GetConfigList failed:%v", err)
+		return &result, err
+	}
+
+	// Assign through the index: writing to the range variable would only
+	// update a per-iteration copy if ParaConfigs holds values rather than
+	// pointers, silently dropping the fetched details.
+	for i := range list.ParaConfigs {
+		paraConfig, err := modelarts.GetParaConfig(list.ParaConfigs[i].ConfigName, configType)
+		if err != nil {
+			log.Error("GetParaConfig failed:%v", err)
+			return &result, err
+		}
+
+		list.ParaConfigs[i].Result = paraConfig
+	}
+
+	return list, nil
+}
diff --git a/routers/routes/routes.go b/routers/routes/routes.go index f6e4c9a72..90d204a82 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -962,15 +962,42 @@ func RegisterRoutes(m *macaron.Macaron) { }, context.RepoRef()) m.Group("/modelarts", func() { - m.Get("", reqRepoCloudBrainReader, repo.ModelArtsIndex) - m.Group("/:jobid", func() { - m.Get("", reqRepoCloudBrainReader, repo.ModelArtsShow) - m.Get("/debug", reqRepoCloudBrainReader, repo.ModelArtsDebug) - m.Post("/stop", reqRepoCloudBrainWriter, repo.ModelArtsStop) - m.Post("/del", reqRepoCloudBrainWriter, repo.ModelArtsDel) + // m.Get("",
reqRepoCloudBrainReader, repo.ModelArtsIndex) + // m.Group("/:jobid", func() { + // m.Get("", reqRepoCloudBrainReader, repo.ModelArtsShow) + // m.Get("/debug", reqRepoCloudBrainReader, repo.ModelArtsDebug) + // m.Post("/stop", reqRepoCloudBrainWriter, repo.ModelArtsStop) + // m.Post("/del", reqRepoCloudBrainWriter, repo.ModelArtsDel) + // }) + // m.Get("/create", reqRepoCloudBrainWriter, repo.ModelArtsNew) + // m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsForm{}), repo.ModelArtsCreate) + + m.Group("/notebook", func() { + m.Get("", reqRepoCloudBrainReader, repo.NotebookIndex) + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) + m.Get("/debug", reqRepoCloudBrainReader, repo.NotebookDebug) + m.Post("/stop", reqRepoCloudBrainWriter, repo.NotebookStop) + m.Post("/del", reqRepoCloudBrainWriter, repo.NotebookDel) + }) + m.Get("/create", reqRepoCloudBrainWriter, repo.NotebookNew) + m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsNotebookForm{}), repo.NotebookCreate) + }) + + m.Group("/train-job", func() { + m.Get("", reqRepoCloudBrainReader, repo.TrainJobIndex) + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) + m.Post("/stop", reqRepoCloudBrainWriter, repo.TrainJobStop) + m.Post("/del", reqRepoCloudBrainWriter, repo.TrainJobDel) + m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog) + m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobListModel) + m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) + }) + m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew) + m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) + m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) }) - m.Get("/create", reqRepoCloudBrainWriter, repo.ModelArtsNew) - m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsForm{}), 
repo.ModelArtsCreate) }, context.RepoRef()) m.Group("/blockchain", func() { diff --git a/templates/repo/cloudbrain/index.tmpl b/templates/repo/cloudbrain/index.tmpl index 3d9c080db..795e35aac 100755 --- a/templates/repo/cloudbrain/index.tmpl +++ b/templates/repo/cloudbrain/index.tmpl @@ -215,7 +215,7 @@
-
+ + +
+ +
+ +
+
+ + + + 新建调试任务 +
diff --git a/templates/repo/header.tmpl b/templates/repo/header.tmpl index cf3386df2..c290de552 100755 --- a/templates/repo/header.tmpl +++ b/templates/repo/header.tmpl @@ -176,13 +176,13 @@
- +
- +
@@ -209,14 +209,16 @@ $('.ui.radio.checkbox').checkbox(); var repolink = $(".cloudbrain_link").text() + console.log(repolink) $(".ui.positive.right.icon.button").click(function(){ // 声明一个变量来接收以及获取单选框选择的情况 - var checked_radio = $("input[type='radio']:checked").val() + var checked_radio = $("input[name='CloudBrainSelect']:checked").val() + console.log(checked_radio) if(checked_radio=='0'){ window.location.href = repolink+'/cloudbrain' }else if(checked_radio=='1'){ - window.location.href = repolink+'/modelarts' + window.location.href = repolink+'/modelarts/notebook' }else{ return; } diff --git a/templates/repo/modelarts/index.tmpl b/templates/repo/modelarts/index.tmpl index 7820cb109..bcfef05da 100755 --- a/templates/repo/modelarts/index.tmpl +++ b/templates/repo/modelarts/index.tmpl @@ -202,24 +202,29 @@
-
-
-

{{.i18n.Tr "repo.cloudbrain2"}}

-
- -
-
-
- {{if .Permission.CanWrite $.UnitTypeCloudBrain}} - {{.i18n.Tr "repo.cloudbrain.new"}} {{end}} -
-
+
-

使用鹏城云脑计算资源进行调试,云脑1提供CPU / GPU资源,云脑2提供Ascend NPU资源;调试使用的数据集也需要上传到对应的环境。

- diff --git a/templates/repo/modelarts/navbar.tmpl b/templates/repo/modelarts/navbar.tmpl new file mode 100755 index 000000000..91c0675c0 --- /dev/null +++ b/templates/repo/modelarts/navbar.tmpl @@ -0,0 +1,43 @@ + + \ No newline at end of file diff --git a/templates/repo/modelarts/notebook/index.tmpl b/templates/repo/modelarts/notebook/index.tmpl new file mode 100755 index 000000000..2a4115723 --- /dev/null +++ b/templates/repo/modelarts/notebook/index.tmpl @@ -0,0 +1,498 @@ + +{{template "base/head" .}} + + + + +
+
+
+
+
+
+
+
+
+ + +
+ +
+ {{template "repo/header" .}} + +
+ + + + +
+ + +
+ + + + 新建调试任务 +
+
+ + +
+
+
+ + + + + +
+ + +
+
+
+ {{$.i18n.Tr "repo.cloudbrain_task"}} +
+
+ {{$.i18n.Tr "repo.cloudbrain_status_createtime"}} +
+
+ {{$.i18n.Tr "repo.cloudbrain_creator"}} +
+
+ {{$.i18n.Tr "repo.cloudbrain_operate"}} +
+ +
+ +
+ + + + {{range .Tasks}} +
+
+ + + + +
+ + + + {{.Status}} + + + {{TimeSinceUnix .Cloudbrain.CreatedUnix $.Lang}} +
+ +
+ {{if .User.Name}} + + {{else}} + + {{end}} +
+ +
+
+ + + 调试 + +
+ {{$.CsrfTokenHtml}} + + 停止 + +
+
+ + + + +
+ {{$.CsrfTokenHtml}} + + 删除 + +
+
+ + + + +
+
+ {{end}} {{template "base/paginate" .}} +
+ +
+
+
+ +
+ +
+
+ +
+ + +
+ +
+ +
+{{template "base/footer" .}} + + \ No newline at end of file diff --git a/templates/repo/modelarts/notebook/new.tmpl b/templates/repo/modelarts/notebook/new.tmpl new file mode 100755 index 000000000..8cfa680f7 --- /dev/null +++ b/templates/repo/modelarts/notebook/new.tmpl @@ -0,0 +1,240 @@ +{{template "base/head" .}} + + +
+
+
+
+
+
+
+
+
+
+ {{template "repo/header" .}} +
+
+ {{template "base/alert" .}} +
+

+
+
+ {{.CsrfTokenHtml}} +

+ {{.i18n.Tr "repo.cloudbrain.new"}} +

+
+ +
+ + +
+ +
+ + + + {{range .attachments}} + + {{end}} + + +
+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + + {{.i18n.Tr "repo.cloudbrain.cancel"}} +
+
+
+
+
+
+{{template "base/footer" .}} + + \ No newline at end of file diff --git a/templates/repo/modelarts/notebook/show.tmpl b/templates/repo/modelarts/notebook/show.tmpl new file mode 100755 index 000000000..3f914b56d --- /dev/null +++ b/templates/repo/modelarts/notebook/show.tmpl @@ -0,0 +1,122 @@ +{{template "base/head" .}} +
+{{template "repo/header" .}} +
+
+ {{template "base/alert" .}} + +

+ 返回 +

+
+
+ {{with .task}} +

任务名称: {{.JobName}}

+ {{end}} +
+
+

任务结果:

+ {{with .result}} + + + + + + + + + + + + + + + +
状态 {{.Status}}
开始时间 {{.CreateTime}}
最后更新时间 {{.LatestUpdateTime}}
+ {{end}} +
+
+ {{with .result}} + + + + + + + + + + + + + + +
配置信息
开发环境类型 {{.Profile.DeType}}
硬件类型 {{.Profile.FlavorType}}
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
机器规格详情
机器规格 {{.Flavor}}
规格名称 {{.FlavorDetails.Name}}
规格销售状态 {{.FlavorDetails.Status}}
排队个数 {{.FlavorDetails.QueuingNum}}
排到队的剩余时间(秒) {{.FlavorDetails.QueueLeftTime}}
自动停止时间(秒) {{.FlavorDetails.Duration}}
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
排队信息
实例状态 {{.QueuingInfo.Status}}
实例排队的开始时间 {{.QueuingInfo.BeginTime}}
排到队的剩余时间(秒) {{.QueuingInfo.RemainTime}}
实例排队的预计停止时间 {{.QueuingInfo.EndTime}}
实例在队列中的排位 {{.QueuingInfo.Rank}}
+ {{end}} +
+
+ +
+
+
+{{template "base/footer" .}} diff --git a/templates/repo/modelarts/trainjob/edit_para.tmpl b/templates/repo/modelarts/trainjob/edit_para.tmpl new file mode 100755 index 000000000..a38b2a4f0 --- /dev/null +++ b/templates/repo/modelarts/trainjob/edit_para.tmpl @@ -0,0 +1,239 @@ +{{template "base/head" .}} +
+
{{.i18n.Tr "loading"}}
+
+ +
+ {{template "repo/header" .}} +
+ {{template "base/alert" .}} +

+ {{.i18n.Tr "repo.modelarts.train_job.new"}} +

+
+ +
+ {{.CsrfTokenHtml}} + +

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}

+
+ + +
+
+ + +
+

{{.i18n.Tr "repo.modelarts.train_job.parameter_setting"}}

+
+ + +
+
+ +
+
+ +
+
+ +
+
+
+
+ + + + + +
+
+
+
+ + +
+ +
+ + {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}} + +
+ {{range .para}} +
+
+ +
+
+ +
+ + + + +
+ {{end}} +
+
+ +

{{.i18n.Tr "repo.modelarts.train_job.resource_setting"}}

+
+ + +
+ +
+ + {{range .benchmark_categories}} +
+
+
+
+ +
+
+
train-private-1
+
{{svg "octicon-verified" 16}} 运行中
+
CPU:192 核 2048GiB
+
+
+ {{end}} +
+ +
+ + +
+
+ + +
+
+ + {{.i18n.Tr "repo.cloudbrain.cancel"}} +
+
+
+
+
+{{template "base/footer" .}} + + \ No newline at end of file diff --git a/templates/repo/modelarts/trainjob/index.tmpl b/templates/repo/modelarts/trainjob/index.tmpl new file mode 100755 index 000000000..0d15638ad --- /dev/null +++ b/templates/repo/modelarts/trainjob/index.tmpl @@ -0,0 +1,534 @@ + +{{template "base/head" .}} + + + + +
+
+
+
+
+
+
+
+
+ + +
+ +
+ {{template "repo/header" .}} + +
+ + + + +
+ + +
+ + + + 新建训练任务 +
+
+ + +
+
+
+ + + + + +
+ + +
+
+
+ {{$.i18n.Tr "repo.cloudbrain_task"}} +
+
+ {{$.i18n.Tr "repo.cloudbrain_status_createtime"}} +
+
+ {{$.i18n.Tr "repo.cloudbrain_status_runtime"}} +
+
+ {{$.i18n.Tr "repo.cloudbrain_creator"}} +
+
+ {{$.i18n.Tr "repo.cloudbrain_operate"}} +
+ +
+ +
+ + + + {{range .Tasks}} +
+
+ + + + +
+ + + + {{.Status}} + + + {{TimeSinceUnix .Cloudbrain.CreatedUnix $.Lang}} +
+ +
+ + + + {{.TrainJobDuration}} + + + +
+ +
+ {{if .User.Name}} + + {{else}} + + {{end}} +
+ +
+
+ +
+ {{$.CsrfTokenHtml}} + + 停止 + +
+
+
+ + + 模型下载 + + + + + +
+ +
+ {{$.CsrfTokenHtml}} + + 删除 + +
+
+ + + + +
+
+ {{end}} {{template "base/paginate" .}} +
+ +
+
+
+ +
+ +
+
+ + + + +
+ +
+ + +{{template "base/footer" .}} + + \ No newline at end of file diff --git a/templates/repo/modelarts/trainjob/list_model.tmpl b/templates/repo/modelarts/trainjob/list_model.tmpl new file mode 100644 index 000000000..a441f5d99 --- /dev/null +++ b/templates/repo/modelarts/trainjob/list_model.tmpl @@ -0,0 +1,47 @@ +{{template "base/head" .}} +
+{{template "repo/header" .}} +
+
+ {{template "base/alert" .}} +

+
+
+ {{$.i18n.Tr "repo.modelarts.version_manage"}} +
+ +
+

+
+
+
+ + {{$.i18n.Tr "repo.modelarts.train_job.version"}} +
+
+
+ {{.ListModel}} +
+ +
+
semantic.json
+
Contains build settings for gulp
+
+
+ +
+
+
+ +
+
+ +
+
+{{template "base/footer" .}} + + \ No newline at end of file diff --git a/templates/repo/modelarts/trainjob/new.tmpl b/templates/repo/modelarts/trainjob/new.tmpl new file mode 100755 index 000000000..310ee56b2 --- /dev/null +++ b/templates/repo/modelarts/trainjob/new.tmpl @@ -0,0 +1,440 @@ +{{template "base/head" .}} + +
+
{{.i18n.Tr "loading"}}
+
+ +
+ {{template "repo/header" .}} +
+ {{template "base/alert" .}} +

+ {{.i18n.Tr "repo.modelarts.train_job.new"}} +

+
+ +
+ {{.CsrfTokenHtml}} + +

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}:

+
+ + +
+ +
+ + +
+ +
+ +

{{.i18n.Tr "repo.modelarts.train_job.parameter_setting"}}:

+ +
+ +
+ + +
+
+ + +
+ +
+ +
+ + + + + +
+
+ + +
+ +
+ + + + {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}} + +
+ +
+ + + + + + +
+ + +
+
+ + +
+ + + + +
+ +
+ + + +
+ + {{.i18n.Tr "repo.cloudbrain.cancel"}} +
+ + + +
+
+
+
+{{template "base/footer" .}} + + \ No newline at end of file diff --git a/templates/repo/modelarts/trainjob/para_manage.tmpl b/templates/repo/modelarts/trainjob/para_manage.tmpl new file mode 100755 index 000000000..b8ac6c78a --- /dev/null +++ b/templates/repo/modelarts/trainjob/para_manage.tmpl @@ -0,0 +1,154 @@ +{{template "base/head" .}} +
+
+ {{template "repo/header" .}} +
+
+ {{template "repo/modelarts/navbar" .}} + +
+
+
+

{{.i18n.Tr "repo.modelarts.train_job_para_admin"}}

+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + +
+ {{range .Tasks}} +
+
+ + + + +
+ {{.Status}} +
+ + +
+ {{svg "octicon-clock" 16}} {{TimeSinceUnix .CreatedUnix $.Lang}} +
+ + + + + +
+
+
+ {{$.CsrfTokenHtml}} + 删除 +
+
+
+ +
+
+ {{end}} {{template "base/paginate" .}} +
+
+
+
+
+
+
+
+ + +
+ +
+
+{{template "base/footer" .}} + + diff --git a/templates/repo/modelarts/trainjob/show.tmpl b/templates/repo/modelarts/trainjob/show.tmpl new file mode 100755 index 000000000..d1fc781d4 --- /dev/null +++ b/templates/repo/modelarts/trainjob/show.tmpl @@ -0,0 +1,200 @@ +{{template "base/head" .}} +
+{{template "repo/header" .}} +
+
+ {{template "base/alert" .}} +

+
+
+ {{$.i18n.Tr "repo.modelarts.version_manage"}} +
+ +
+

+ +
+ +
+
+ + {{$.i18n.Tr "repo.modelarts.train_job.version"}} +
+
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}
{{.i18n.Tr "repo.modelarts.train_job.job_name"}} {{.result.JobName}}
{{.i18n.Tr "repo.modelarts.train_job.job_status"}} {{.result.Status}}
{{.i18n.Tr "repo.modelarts.train_job.version"}} {{.result.VersionName}}
{{.i18n.Tr "repo.modelarts.train_job.start_time"}} {{.result.CreateTime}}
{{.i18n.Tr "repo.modelarts.train_job.dura_time"}} {{.result.TrainJobDuration}}
{{.i18n.Tr "repo.modelarts.train_job.description"}} {{.result.Description}}
+
+
+ + + + + + + + + + + + + + + + + + + + + + +
{{.i18n.Tr "repo.modelarts.train_job.parameter_setting_info"}}
{{.i18n.Tr "repo.modelarts.train_job.AI_driver"}} {{.result.EngineName}} | {{.result.EngineVersion}}
{{.i18n.Tr "repo.modelarts.train_job.start_file"}}{{.result.BootFileUrl}}
{{.i18n.Tr "repo.modelarts.train_job.dataset"}} {{.result.DatasetName}}
{{.i18n.Tr "repo.modelarts.train_job.run_parameter"}} {{.result.Parameter}}
+
+
+ + + + + + + + + + + + + + + + + + +
{{.i18n.Tr "repo.modelarts.train_job.resource_setting_info"}}
{{.i18n.Tr "repo.modelarts.train_job.resource_pool"}} {{.result.PoolName}}
{{.i18n.Tr "repo.modelarts.train_job.amount_of_compute_node"}}{{.result.WorkServerNum}}
{{.i18n.Tr "repo.modelarts.train_job.NAS_mount_path"}} {{.result.NasMountPath}}
+
+
+
+
+ +
+
+ + {{.log_file_name}} + + + +
+
+
+
{{.log.Content}}
+
+ +
+
+
+
+
+
+
+
+ +{{template "base/footer" .}} + + \ No newline at end of file diff --git a/vendor/modules.txt b/vendor/modules.txt index dbe4f72b4..d855d421d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -444,6 +444,7 @@ github.com/golang/protobuf/ptypes/timestamp # github.com/golang/snappy v0.0.1 github.com/golang/snappy # github.com/gomodule/redigo v2.0.0+incompatible +## explicit github.com/gomodule/redigo/internal github.com/gomodule/redigo/redis # github.com/google/go-github/v24 v24.0.1 diff --git a/web_src/less/openi.less b/web_src/less/openi.less index cf8ca6d27..aa9d31d76 100644 --- a/web_src/less/openi.less +++ b/web_src/less/openi.less @@ -224,7 +224,7 @@ footer .column{margin-bottom:0!important; padding-bottom:0!important;} // icon cloudbrain .i-round{display:inline-block;width:18px;height:18px;background:url("/img/icons.svg");background-position: -496px -52px;} .i-bg-organ{background-position: -496px -52px;} -.STOPPED{display:inline-block;width:18px;height:18px;background:url("/img/icons.svg");background-position: -496px -52px;background-position: -459px -52px;} +.STOPPED, .KILLED{display:inline-block;width:18px;height:18px;background:url("/img/icons.svg");background-position: -496px -52px;background-position: -459px -52px;} .RUNNING{display:inline-block;width:18px;height:18px;background:url("/img/icons.svg");background-position: -496px -52px;background-position: -478px -52px;} .i-bg-orange{background-position: -495px -51px;} .FAILED{display:inline-block;width:18px;height:18px;background:url("/img/icons.svg");background-position: -496px -52px;background-position: -532px -52px;} @@ -233,6 +233,8 @@ footer .column{margin-bottom:0!important; padding-bottom:0!important;} .icon-bind{background-position: -550px -52px;} .icon-unbind{background-position: -568px -52px;} .CREATING, .STOPPING, .DELETING, .STARTING, .WAITING{display:inline-block;background-image:url('/img/loading.gif');background-repeat:no-repeat;width:16px;height:16px;background-size:16px 
16px;margin-right:5px;} + +.COMPLETED{display:inline-block;width:18px;height:18px;background:url("/img/icons.svg");background-position: -496px -52px;background-position: -441px -52px;} .text_over{ overflow: hidden; text-overflow: ellipsis; diff --git a/web_src/less/themes/theme-arc-green.less b/web_src/less/themes/theme-arc-green.less old mode 100644 new mode 100755