Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/2254
Reviewed-by: liuzx <liuzx@pcl.ac.cn>
tags/v1.22.6.1^2
| @@ -1150,6 +1150,17 @@ type LogFile struct { | |||
| Name string | |||
| } | |||
| type GetTrainJobMetricStatisticResult struct { | |||
| TrainJobResult | |||
| Interval int `json:"interval"` //查询的时间间隔,单位为分钟 | |||
| MetricsInfo []Metrics `json:"metrics"` //监控详情 | |||
| } | |||
// Metrics is the series of sampled values for a single monitored metric.
type Metrics struct {
	// Metric is the name of the monitored metric item.
	Metric string `json:"metric"`
	// Value is the sequence of sampled monitoring values; every element is
	// delivered as a string.
	Value []string `json:"value"`
}
| func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | |||
| sess := x.NewSession() | |||
| defer sess.Close() | |||
| @@ -1119,3 +1119,44 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetTrainJobMetricStatisticResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/pod/" + podName + "/metric-statistic") | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetTrainJobMetricStatistic(%s) failed", jobID) | |||
| return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| @@ -922,6 +922,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Post("/del_version", repo.DelTrainJobVersion) | |||
| m.Post("/stop_version", repo.StopTrainJobVersion) | |||
| m.Get("/model_list", repo.ModelList) | |||
| m.Get("/metric_statistics", repo.TrainJobGetMetricStatistic) | |||
| }) | |||
| }) | |||
| m.Group("/inference-job", func() { | |||
| @@ -462,3 +462,46 @@ func ResultList(ctx *context.APIContext) { | |||
| "PageIsCloudBrain": true, | |||
| }) | |||
| } | |||
| func TrainJobGetMetricStatistic(ctx *context.APIContext) { | |||
| var ( | |||
| err error | |||
| ) | |||
| var jobID = ctx.Params(":jobid") | |||
| var versionName = ctx.Query("version_name") | |||
| result, err := trainJobGetMetricStatistic(jobID, versionName) | |||
| if err != nil { | |||
| log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error()) | |||
| return | |||
| } | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| "JobID": jobID, | |||
| "Interval": result.Interval, | |||
| "MetricsInfo": result.MetricsInfo, | |||
| }) | |||
| } | |||
| func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) { | |||
| task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) | |||
| return nil, err | |||
| } | |||
| resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) | |||
| if err != nil { | |||
| log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) | |||
| return nil, err | |||
| } | |||
| result, err := modelarts.GetTrainJobMetricStatistic(jobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0]) | |||
| if err != nil { | |||
| log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error()) | |||
| return nil, err | |||
| } | |||
| return result, err | |||
| } | |||
| @@ -305,6 +305,7 @@ | |||
| data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a> | |||
| <a class="item log_bottom" data-tab="second{{$k}}" | |||
| data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a> | |||
| <a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}">资源占用情况</a> | |||
| <a class="item" data-tab="third{{$k}}" | |||
| onclick="loadModelFile({{.VersionName}},'','','init')">{{$.i18n.Tr "repo.model_download"}}</a> | |||
| </div> | |||
| @@ -501,6 +502,13 @@ | |||
| </div> | |||
| </div> | |||
| <div class="ui tab" data-tab="four{{$k}}" style="position: relative;"> | |||
| <i class="ri-refresh-line metric_chart" | |||
| style="position: absolute;right: 25%;color:#3291f8;z-index:99;cursor: pointer;" | |||
| data-version="{{.VersionName}}"></i> | |||
| <div id="metric-{{.VersionName}}" style="height: 260px;width: 870px;"> | |||
| </div> | |||
| </div> | |||
| <div class="ui tab" data-tab="third{{$k}}"> | |||
| <input type="hidden" name="model{{.VersionName}}" value="-1"> | |||
| <input type="hidden" name="modelback{{.VersionName}}" value="-1"> | |||
| @@ -5070,3 +5070,100 @@ function initcreateRepo() { | |||
| } | |||
| initcreateRepo() | |||
/**
 * Wires the resource-usage charts of the train-job page.
 *
 * Every element with class `metric_chart` (the tab header and the refresh
 * icon) carries a `data-version` attribute. Clicking one fetches the metric
 * statistics of that version and draws them as line series into the element
 * with id `metric-<version>`.
 *
 * User name, repository and job id are taken positionally from the current
 * page URL (5th-, 4th- and 1st-from-last path segments).
 */
function initChartsNpu() {
  const urlArr = window.location.href.split('/')
  const userName = urlArr.slice(-5)[0]
  const repoPath = urlArr.slice(-4)[0]
  const jobID = urlArr.slice(-1)[0]

  // Metrics left out of the chart; presumably because they are rates rather
  // than percentages and do not fit the fixed 0-100 axis -- TODO confirm.
  const hiddenMetrics = ['recvBytesRate', 'diskWriteRate', 'sendBytesRate', 'diskReadRate']

  // Build a fresh option object per render. A single shared, mutated object
  // would carry one version's series over into another version's chart.
  const buildOptions = () => ({
    legend: {
      data: []
    },
    grid: {
      top: '30%',
      bottom: '2%',
      containLabel: true
    },
    tooltip: {
      trigger: 'axis',
      backgroundColor: 'rgb(51, 56, 84)',
      borderColor: 'rgb(51, 51, 51)',
      borderWidth: 0,
      textStyle: {
        color: '#fff'
      },
      axisPointer: {
        type: 'line'
      }
    },
    xAxis: {
      type: 'category',
      data: [],
      boundaryGap: false,
      axisLabel: {
        interval: 'auto'
      },
      name: '时间(min)'
    },
    yAxis: {
      min: 0,
      max: 100,
      show: true,
      name: '占有率(%)',
      axisLine: {
        show: true
      },
      axisTick: { show: true }
    },
    series: []
  })

  // One line series per metric.
  const toSeries = (item) => ({
    name: item.metric,
    type: 'line',
    symbol: 'circle',
    symbolSize: 10,
    smooth: true,
    showSymbol: false,
    lineStyle: {
      width: 2,
      shadowColor: 'rgba(0,0,0,0.3)',
      shadowBlur: 10,
      shadowOffsetY: 8
    },
    data: item.value
  })

  $('.metric_chart').click(function () {
    const versionName = $(this).data('version')
    const container = document.getElementById(`metric-${versionName}`)
    if (!container) {
      return
    }

    // Reuse the chart already bound to this container instead of calling
    // echarts.init on the same DOM node on every click.
    let chart = echarts.getInstanceByDom(container)
    if (!chart) {
      chart = echarts.init(container)
      // Show the empty axes while the first request is in flight.
      chart.setOption(buildOptions())
    }

    const url = `${window.config.AppSubUrl}/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/metric_statistics?version_name=${encodeURIComponent(versionName)}&statistic_type=each`
    $.get(url, (res) => {
      const metrics = res && Array.isArray(res.MetricsInfo) ? res.MetricsInfo : []
      if (metrics.length === 0) {
        // Nothing to draw; keep whatever the chart currently shows.
        return
      }

      const shown = metrics.filter((item) => !hiddenMetrics.includes(item.metric))
      // The x axis is just the 1-based sample index, sized after the first
      // metric returned by the server.
      const sampleCount = Array.isArray(metrics[0].value) ? metrics[0].value.length : 0

      const options = buildOptions()
      options.xAxis.data = Array.from({ length: sampleCount }, (_, index) => index + 1)
      options.legend.data = shown.map((item) => item.metric)
      options.series = shown.map(toSeries)
      // notMerge = true: replace the previous render completely so stale
      // series cannot survive a refresh.
      chart.setOption(options, true)
    })
  })
}
initChartsNpu()