diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index 451a36c81..21d9768f7 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -1150,6 +1150,20 @@ type LogFile struct {
 	Name string
 }
 
+// GetTrainJobMetricStatisticResult is the decoded response of the ModelArts
+// train-job "metric-statistic" request.
+type GetTrainJobMetricStatisticResult struct {
+	TrainJobResult
+	Interval    int       `json:"interval"` // time interval of the query, in minutes
+	MetricsInfo []Metrics `json:"metrics"`  // monitoring details
+}
+
+// Metrics holds the sampled values of a single monitored metric.
+type Metrics struct {
+	Metric string   `json:"metric"` // monitored metric item
+	Value  []string `json:"value"`  // sequence of sampled values; every element is a string
+}
+
 func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
 	sess := x.NewSession()
 	defer sess.Close()
diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go
index 2f7d08c35..961e02538 100755
--- a/modules/modelarts/resty.go
+++ b/modules/modelarts/resty.go
@@ -1119,3 +1119,50 @@ sendjob:
 
 	return &result, nil
 }
+
+// GetTrainJobMetricStatistic fetches the metric statistics of one pod of a
+// train-job version from ModelArts. A transport error yields a nil result;
+// a non-200 status or an unsuccessful result yields the decoded result
+// together with a non-nil error.
+func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetTrainJobMetricStatisticResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/pod/" + podName + "/metric-statistic")
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err)
+	}
+
+	// Unauthorized: call getToken once and resend. A getToken failure is not
+	// checked here; it surfaces as the non-200 response of the resent request.
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		_ = getToken()
+		goto sendjob
+	}
+
+	if res.StatusCode() != http.StatusOK {
+		var temp models.ErrorResult
+		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
+			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+		}
+		log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+		return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+	}
+
+	if !result.IsSuccess {
+		log.Error("GetTrainJobMetricStatistic(%s) failed", jobID)
+		return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg)
+	}
+
+	return &result, nil
+}
diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go
index 9a05aa8ae..d6d3b001a 100755
--- a/routers/api/v1/api.go
+++ b/routers/api/v1/api.go
@@ -922,6 +922,7 @@ func RegisterRoutes(m *macaron.Macaron) {
 						m.Post("/del_version", repo.DelTrainJobVersion)
 						m.Post("/stop_version", repo.StopTrainJobVersion)
 						m.Get("/model_list", repo.ModelList)
+						m.Get("/metric_statistics", repo.TrainJobGetMetricStatistic)
 					})
 				})
 				m.Group("/inference-job", func() {
diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go
index 9e4edea03..c14976282 100755
--- a/routers/api/v1/repo/modelarts.go
+++ b/routers/api/v1/repo/modelarts.go
@@ -462,3 +462,63 @@ func ResultList(ctx *context.APIContext) {
 		"PageIsCloudBrain": true,
 	})
 }
+
+// TrainJobGetMetricStatistic responds with the metric statistics of the
+// train-job version selected by the ":jobid" path parameter and the
+// "version_name" query parameter.
+func TrainJobGetMetricStatistic(ctx *context.APIContext) {
+	jobID := ctx.Params(":jobid")
+	versionName := ctx.Query("version_name")
+
+	result, err := trainJobGetMetricStatistic(jobID, versionName)
+	if err != nil {
+		log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error())
+		// Always answer the request: a bare return would leave the client
+		// without an error status or body.
+		ctx.JSON(http.StatusInternalServerError, map[string]interface{}{
+			"JobID": jobID,
+			"Error": err.Error(),
+		})
+		return
+	}
+
+	ctx.JSON(http.StatusOK, map[string]interface{}{
+		"JobID":       jobID,
+		"Interval":    result.Interval,
+		"MetricsInfo": result.MetricsInfo,
+	})
+}
+
+// trainJobGetMetricStatistic looks up the task for jobID/versionName and asks
+// ModelArts for the metric statistics of the pod named by the first entry of
+// the task's log file list. When that list is empty it returns a result with
+// no metrics and a nil error.
+func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) {
+	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
+	if err != nil {
+		log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
+		return nil, err
+	}
+	versionID := strconv.FormatInt(task.VersionID, 10)
+
+	resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, versionID)
+	if err != nil {
+		log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
+		return nil, err
+	}
+
+	// No log file means there is no pod name to query; indexing
+	// LogFileList[0] here would panic.
+	if len(resultLogFile.LogFileList) == 0 {
+		log.Error("GetTrainJobLogFileNames(%s) returned no log files", jobID)
+		return &models.GetTrainJobMetricStatisticResult{MetricsInfo: []models.Metrics{}}, nil
+	}
+
+	result, err := modelarts.GetTrainJobMetricStatistic(jobID, versionID, resultLogFile.LogFileList[0])
+	if err != nil {
+		log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error())
+		return nil, err
+	}
+
+	return result, nil
+}
diff --git a/templates/repo/modelarts/trainjob/show.tmpl b/templates/repo/modelarts/trainjob/show.tmpl
index 8f168fcf9..695ae601e 100755
--- a/templates/repo/modelarts/trainjob/show.tmpl
+++ b/templates/repo/modelarts/trainjob/show.tmpl
@@ -305,6 +305,7 @@ data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}} {{$.i18n.Tr "repo.modelarts.log"}} + 资源占用情况 {{$.i18n.Tr "repo.model_download"}}
@@ -501,6 +502,13 @@ +
+ +
+
+
diff --git a/web_src/js/index.js b/web_src/js/index.js
index 34b5febc5..c51f6638e 100755
--- a/web_src/js/index.js
+++ b/web_src/js/index.js
@@ -5070,3 +5070,100 @@ function initcreateRepo() {
 }
 
 initcreateRepo()
+
+
+// Binds every `.metric_chart` element to an ECharts line chart that shows the
+// metric statistics of the train-job version named in its `data-version`.
+function initChartsNpu() {
+  // NOTE(review): assumes the page URL ends with <user>/<repo>/<x>/<y>/<jobID> - confirm.
+  const urlArr = window.location.href.split('/')
+  const userName = urlArr.slice(-5)[0]
+  const repoPath = urlArr.slice(-4)[0]
+  const jobID = urlArr.slice(-1)[0]
+  // Left out of the chart (presumably because they do not fit the 0-100 % axis).
+  const hiddenMetrics = ['recvBytesRate', 'diskWriteRate', 'sendBytesRate', 'diskReadRate']
+
+  const options = {
+    legend: {
+      data: []
+    },
+    grid: {
+      top: '30%',
+      bottom: '2%',
+      containLabel: true
+    },
+    tooltip: {
+      trigger: 'axis',
+      backgroundColor: 'rgb(51, 56, 84)',
+      borderColor: 'rgb(51, 51, 51)',
+      borderWidth: 0,
+      textStyle: {
+        color: '#fff'
+      },
+      axisPointer: {
+        type: 'line'
+      }
+    },
+    xAxis: {
+      type: 'category',
+      data: [],
+      boundaryGap: false,
+      axisLabel: {
+        interval: 'auto'
+      },
+      name: '时间(min)'
+    },
+    yAxis: {
+      min: 0,
+      max: 100,
+      show: true,
+      name: '占有率(%)',
+      axisLine: {
+        show: true
+      },
+      axisTick: { show: true }
+    },
+    series: []
+  }
+
+  $('.metric_chart').click(function () {
+    const versionName = $(this).data('version')
+    const chartDom = document.getElementById(`metric-${versionName}`)
+    if (!chartDom) {
+      return
+    }
+    // Reuse the chart already attached to this element instead of re-initialising it on every click.
+    const myCharts = echarts.getInstanceByDom(chartDom) || echarts.init(chartDom)
+    // Draw with the current options right away; the response below updates them.
+    myCharts.setOption(options)
+
+    const url = `${window.config.AppSubUrl}/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/metric_statistics?version_name=${encodeURIComponent(versionName)}&statistic_type=each`
+    $.get(url, (res) => {
+      // Tolerate a response without metrics instead of throwing on MetricsInfo[0].
+      const metrics = (res && Array.isArray(res.MetricsInfo)) ? res.MetricsInfo : []
+      const shown = metrics.filter((item) => !hiddenMetrics.includes(item.metric))
+      const xLength = (metrics.length > 0 && Array.isArray(metrics[0].value)) ? metrics[0].value.length : 0
+
+      options.xAxis.data = Array.from({ length: xLength }, (_, index) => index + 1)
+      options.legend.data = shown.map((item) => item.metric)
+      options.series = shown.map((item) => ({
+        name: item.metric,
+        type: 'line',
+        symbol: 'circle',
+        symbolSize: 10,
+        smooth: true,
+        showSymbol: false,
+        lineStyle: {
+          width: 2,
+          shadowColor: 'rgba(0,0,0,0.3)',
+          shadowBlur: 10,
+          shadowOffsetY: 8
+        },
+        data: item.value
+      }))
+      myCharts.setOption(options)
+    })
+  })
+}
+
+initChartsNpu()
\ No newline at end of file