| @@ -291,6 +291,13 @@ func (task *Cloudbrain) IsRunning() bool { | |||
| status == string(JobRunning) || status == GrampusStatusRunning | |||
| } | |||
| // IsUserHasRight reports whether user has rights on this task. | |||
| // It is false for a nil user; otherwise it is true when the user is an | |||
| // administrator or when the user's ID equals the task's UserID. | |||
| func (task *Cloudbrain) IsUserHasRight(user *User) bool { | |||
| return user != nil && (user.IsAdmin || user.ID == task.UserID) | |||
| } | |||
| func ConvertDurationToStr(duration int64) string { | |||
| if duration <= 0 { | |||
| return DURATION_STR_ZERO | |||
| @@ -12,6 +12,13 @@ const ( | |||
| SpecOffShelf | |||
| ) | |||
| // SearchSpecOrderBy selects the ordering that SearchResourceSpecification | |||
| // applies to its result rows. | |||
| type SearchSpecOrderBy int | |||
| const ( | |||
| // SearchSpecOrderById is the zero value. SearchResourceSpecification has no | |||
| // dedicated case for it, so it takes the default ordering, | |||
| // "resource_specification.id desc". | |||
| SearchSpecOrderById = SearchSpecOrderBy(iota) | |||
| // SearchSpecOrder4Standard orders ascending by the queue's compute resource | |||
| // and card type, then by the specification's card count, CPU cores, memory | |||
| // and shared memory. | |||
| SearchSpecOrder4Standard | |||
| ) | |||
| type ResourceSpecification struct { | |||
| ID int64 `xorm:"pk autoincr"` | |||
| QueueId int64 `xorm:"INDEX"` | |||
| @@ -85,6 +92,7 @@ type SearchResourceSpecificationOptions struct { | |||
| Status int | |||
| Cluster string | |||
| AvailableCode int | |||
| OrderBy SearchSpecOrderBy | |||
| } | |||
| type SearchResourceBriefSpecificationOptions struct { | |||
| @@ -233,10 +241,18 @@ func SearchResourceSpecification(opts SearchResourceSpecificationOptions) (int64 | |||
| return 0, nil, err | |||
| } | |||
| var orderby = "" | |||
| switch opts.OrderBy { | |||
| case SearchSpecOrder4Standard: | |||
| orderby = "resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc,resource_specification.cpu_cores asc,resource_specification.mem_gi_b asc,resource_specification.share_mem_gi_b asc" | |||
| default: | |||
| orderby = "resource_specification.id desc" | |||
| } | |||
| r := make([]ResourceSpecAndQueue, 0) | |||
| err = x.Where(cond). | |||
| Join("INNER", "resource_queue", "resource_queue.ID = resource_specification.queue_id"). | |||
| Desc("resource_specification.id"). | |||
| OrderBy(orderby). | |||
| Limit(opts.PageSize, (opts.Page-1)*opts.PageSize). | |||
| Unscoped().Find(&r) | |||
| if err != nil { | |||
| @@ -245,6 +245,32 @@ func GetTrainJobLog(jobID string) (string, error) { | |||
| return logContent, nil | |||
| } | |||
| // GetGrampusMetrics fetches the metrics of task 0 / replica 0 of the train job | |||
| // identified by jobID from the Grampus service. | |||
| // | |||
| // A non-nil error is returned when the HTTP request fails, when the response | |||
| // body is not valid JSON, when the HTTP status is not 200, or when the decoded | |||
| // result has IsSuccess == false. The (possibly partially filled) result is | |||
| // returned alongside the error, as callers in this file read its fields even | |||
| // on failure. | |||
| func GetGrampusMetrics(jobID string) (models.GetTrainJobMetricStatisticResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetTrainJobMetricStatisticResult | |||
| res, err := client.R(). | |||
| SetAuthToken(TOKEN). | |||
| Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics") | |||
| if err != nil { | |||
| // Name this function in the error; the message previously said GetTrainJobLog. | |||
| return result, fmt.Errorf("resty GetGrampusMetrics: %v", err) | |||
| } | |||
| // Read the body once; it is needed for decoding and for both failure messages. | |||
| body := res.String() | |||
| // Decode before checking the status so ErrorCode/ErrorMsg are available for | |||
| // the non-200 message below. | |||
| if err = json.Unmarshal([]byte(body), &result); err != nil { | |||
| log.Error("GetGrampusMetrics json.Unmarshal failed(%s): %v", body, err.Error()) | |||
| return result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error()) | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| // %v is used for ErrorCode in both messages: the log used %s and the | |||
| // returned error used %d, so one of them was malformed. | |||
| // NOTE(review): confirm the declared type of ErrorCode. | |||
| log.Error("Call GrampusMetrics failed(%d):%v(%s)", res.StatusCode(), result.ErrorCode, result.ErrorMsg) | |||
| return result, fmt.Errorf("Call GrampusMetrics failed(%d):%v(%s)", res.StatusCode(), result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetGrampusMetrics(%s) failed", jobID) | |||
| return result, fmt.Errorf("GetGrampusMetrics failed:%s", result.ErrorMsg) | |||
| } | |||
| return result, nil | |||
| } | |||
| func StopJob(jobID string) (*models.GrampusStopJobResponse, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| @@ -127,6 +127,7 @@ func GetResourceSpecificationList(ctx *context.Context) { | |||
| Status: status, | |||
| Cluster: cluster, | |||
| AvailableCode: available, | |||
| OrderBy: models.SearchSpecOrderById, | |||
| }) | |||
| if err != nil { | |||
| log.Error("GetResourceSpecificationList error.%v", err) | |||
| @@ -1048,6 +1048,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Get("", repo.GetModelArtsTrainJobVersion) | |||
| m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob) | |||
| m.Get("/log", repo_ext.GrampusGetLog) | |||
| m.Get("/metrics", repo_ext.GrampusMetrics) | |||
| m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog) | |||
| }) | |||
| }) | |||
| @@ -661,14 +661,20 @@ func CloudbrainGetLog(ctx *context.APIContext) { | |||
| if ctx.Data["existStr"] != nil && result["Lines"].(int) < 50 { | |||
| content = content + ctx.Data["existStr"].(string) | |||
| } | |||
| logFileName := result["FileName"] | |||
| //Logs can only be downloaded if the file exists | |||
| //and the current user is an administrator or the creator of the task | |||
| canLogDownload := logFileName != nil && logFileName != "" && job.IsUserHasRight(ctx.User) | |||
| re := map[string]interface{}{ | |||
| "JobID": ID, | |||
| "LogFileName": result["FileName"], | |||
| "LogFileName": logFileName, | |||
| "StartLine": result["StartLine"], | |||
| "EndLine": result["EndLine"], | |||
| "Content": content, | |||
| "Lines": result["Lines"], | |||
| "CanLogDownload": result["FileName"] != "", | |||
| "CanLogDownload": canLogDownload, | |||
| "StartTime": job.StartTime, | |||
| } | |||
| //result := CloudbrainGetLogByJobId(job.JobID, job.JobName) | |||
| @@ -281,15 +281,6 @@ func TrainJobGetLog(ctx *context.APIContext) { | |||
| return | |||
| } | |||
| prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job" | |||
| _, err = storage.GetObsLogFileName(prefix) | |||
| var canLogDownload bool | |||
| if err != nil { | |||
| canLogDownload = false | |||
| } else { | |||
| canLogDownload = true | |||
| } | |||
| ctx.Data["log_file_name"] = resultLogFile.LogFileList[0] | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| @@ -299,11 +290,23 @@ func TrainJobGetLog(ctx *context.APIContext) { | |||
| "EndLine": result.EndLine, | |||
| "Content": result.Content, | |||
| "Lines": result.Lines, | |||
| "CanLogDownload": canLogDownload, | |||
| "CanLogDownload": canLogDownload(ctx.User, task), | |||
| "StartTime": task.StartTime, | |||
| }) | |||
| } | |||
| // canLogDownload reports whether user may download the log of task. | |||
| // It is false when task is nil, when task.IsUserHasRight(user) is false, or | |||
| // when storage.GetObsLogFileName returns an error for the task's log prefix. | |||
| func canLogDownload(user *models.User, task *models.Cloudbrain) bool { | |||
| if task == nil { | |||
| return false | |||
| } | |||
| if !task.IsUserHasRight(user) { | |||
| return false | |||
| } | |||
| // Log prefix: <TrainJobModelPath>/<JobName>/<LogPath>/<VersionName>/job, without a leading slash. | |||
| logDir := path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, task.VersionName) | |||
| logPrefix := strings.TrimPrefix(logDir, "/") + "/job" | |||
| _, err := storage.GetObsLogFileName(logPrefix) | |||
| return err == nil | |||
| } | |||
| func trainJobGetLogContent(jobID string, versionID int64, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { | |||
| resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(versionID, 10)) | |||
| @@ -940,15 +940,14 @@ func GrampusGetLog(ctx *context.Context) { | |||
| content, err := grampus.GetTrainJobLog(job.JobID) | |||
| if err != nil { | |||
| log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"]) | |||
| ctx.ServerError(err.Error(), err) | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| "JobName": job.JobName, | |||
| "Content": "", | |||
| "CanLogDownload": false, | |||
| }) | |||
| return | |||
| } | |||
| var canLogDownload bool | |||
| if err != nil { | |||
| canLogDownload = false | |||
| } else { | |||
| canLogDownload = true | |||
| } | |||
| canLogDownload := err == nil && job.IsUserHasRight(ctx.User) | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| "JobName": job.JobName, | |||
| "Content": content, | |||
| @@ -958,6 +957,28 @@ func GrampusGetLog(ctx *context.Context) { | |||
| return | |||
| } | |||
| // GrampusMetrics is the handler for the train-job "/metrics" route. It looks | |||
| // up the task by the ":jobid" route parameter, asks Grampus for its metrics | |||
| // and writes JobID, Interval and MetricsInfo as JSON. | |||
| // | |||
| // A failed task lookup yields a server error. A failed metrics fetch is only | |||
| // logged, and the response is still 200 with whatever the result holds. | |||
| // NOTE(review): presumably this is best-effort so the chart renders empty | |||
| // instead of erroring — confirm this is intended. | |||
| func GrampusMetrics(ctx *context.Context) { | |||
| jobID := ctx.Params(":jobid") | |||
| job, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"]) | |||
| ctx.ServerError(err.Error(), err) | |||
| return | |||
| } | |||
| result, err := grampus.GetGrampusMetrics(job.JobID) | |||
| if err != nil { | |||
| // Name the call that failed; the message previously said GetTrainJobLog. | |||
| log.Error("GetGrampusMetrics failed: %v", err, ctx.Data["MsgID"]) | |||
| } | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| "JobID": jobID, | |||
| "Interval": result.Interval, | |||
| "MetricsInfo": result.MetricsInfo, | |||
| }) | |||
| } | |||
| func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) { | |||
| var command string | |||
| @@ -14,7 +14,7 @@ import ( | |||
| var noteBookOKMap = make(map[int64]int, 20) | |||
| //if a task notebook url can be fetched successfulCount times, the notebook can be browsed. | |||
| const successfulCount = 2 | |||
| const successfulCount = 3 | |||
| func SyncCloudBrainOneStatus(task *models.Cloudbrain) (*models.Cloudbrain, error) { | |||
| jobResult, err := cloudbrain.GetJob(task.JobID) | |||
| @@ -138,6 +138,7 @@ func GetResourceSpecificationList(opts models.SearchResourceSpecificationOptions | |||
| func GetAllDistinctResourceSpecification(opts models.SearchResourceSpecificationOptions) (*models.ResourceSpecAndQueueListRes, error) { | |||
| opts.Page = 0 | |||
| opts.PageSize = 1000 | |||
| opts.OrderBy = models.SearchSpecOrder4Standard | |||
| _, r, err := models.SearchResourceSpecification(opts) | |||
| if err != nil { | |||
| return nil, err | |||
| @@ -284,7 +284,8 @@ | |||
| <div class="content-pad"> | |||
| <div class="ui pointing secondary menu" style="border-bottom: 1px solid rgba(34,36,38,.15);"> | |||
| <a class="active item" | |||
| data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a> | |||
| data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a> | |||
| <a class="item log_bottom" data-tab="third{{$k}}" | |||
| data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a> | |||
| <a class="item load-model-file" data-tab="four{{$k}}" data-gpu-flag="true" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/cloudbrain/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a> | |||
| @@ -238,11 +238,8 @@ | |||
| <span> | |||
| <div style="float: right;"> | |||
| {{$.CsrfTokenHtml}} | |||
| </div> | |||
| <div class="ac-display-inblock title_text acc-margin-bottom"> | |||
| <span class="cti-mgRight-sm">{{TimeSinceUnix1 .CreatedUnix}}</span> | |||
| <span class="cti-mgRight-sm"> | |||
| {{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}}</span> | |||
| @@ -260,7 +257,6 @@ | |||
| <span class="refresh-status" data-tooltip="刷新" style="cursor: pointer;" data-inverted="" data-version="{{.VersionName}}"> | |||
| <i class="redo icon redo-color"></i> | |||
| </span> | |||
| </div> | |||
| <div style="float: right;"> | |||
| {{if and ($.canDownload) (ne .Status "WAITING") ($.Permission.CanWrite $.UnitTypeModelManage) }} | |||
| @@ -269,7 +265,6 @@ | |||
| {{else}} | |||
| <a class="ti-action-menu-item disabled" id="{{.VersionName}}-create-model">{{$.i18n.Tr "repo.modelarts.create_model"}}</a> | |||
| {{end}} | |||
| </div> | |||
| </span> | |||
| </span> | |||
| @@ -282,6 +277,9 @@ | |||
| <a class="active item" data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a> | |||
| <a class="item log_bottom" data-tab="second{{$k}}" data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a> | |||
| {{ if eq $.Spec.ComputeResource "NPU"}} | |||
| <a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}" data-path="{{$.RepoRelPath}}/grampus/train-job/{{.JobID}}/metrics">{{$.i18n.Tr "cloudbrain.resource_use"}}</a> | |||
| {{end}} | |||
| <a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a> | |||
| </div> | |||
| <div class="ui tab active" data-tab="first{{$k}}"> | |||
| @@ -564,6 +562,14 @@ | |||
| </div> | |||
| </div> | |||
| <div class="ui tab" data-tab="four{{$k}}" style="position: relative;"> | |||
| <i class="ri-refresh-line metric_chart" | |||
| style="position: absolute;right: 25%;color:#3291f8;z-index:99;cursor: pointer;" | |||
| data-version="{{.VersionName}}"></i> | |||
| <div id="metric-{{.VersionName}}" style="height: 260px;width: 870px;"> | |||
| </div> | |||
| </div> | |||
| <div class="ui tab" data-tab="third{{$k}}"> | |||
| <input type="hidden" name="model{{.VersionName}}" value="-1"> | |||
| @@ -321,7 +321,7 @@ | |||
| data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a> | |||
| <a class="item log_bottom" data-tab="second{{$k}}" | |||
| data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a> | |||
| <a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}">{{$.i18n.Tr "cloudbrain.resource_use"}}</a> | |||
| <a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}" data-path="{{$.RepoRelPath}}/modelarts/train-job/{{.JobID}}/metric_statistics?version_name={{.VersionName}}&statistic_type=each&metrics=">{{$.i18n.Tr "cloudbrain.resource_use"}}</a> | |||
| <a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a> | |||
| </div> | |||
| <div class="ui tab active" data-tab="first{{$k}}"> | |||
| @@ -5071,12 +5071,7 @@ function initcreateRepo() { | |||
| initcreateRepo(); | |||
| function initChartsNpu() { | |||
| const url = window.location.href; | |||
| const urlArr = url.split("/"); | |||
| let userName = urlArr.slice(-5)[0]; | |||
| let repoPath = urlArr.slice(-4)[0]; | |||
| let jobID = urlArr.slice(-1)[0]; | |||
| const repoPath = $('.metric_chart').data('path') | |||
| let options = { | |||
| legend: { | |||
| data: [], | |||
| @@ -5127,7 +5122,7 @@ function initChartsNpu() { | |||
| document.getElementById(`metric-${versionName}`) | |||
| ); | |||
| $.get( | |||
| `${window.config.AppSubUrl}/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/metric_statistics?version_name=${versionName}&statistic_type=each&metrics=`, | |||
| `${window.config.AppSubUrl}/api/v1/repos/${repoPath}`, | |||
| (res) => { | |||
| let filterDta = res.MetricsInfo.filter((item) => { | |||
| return ![ | |||