Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/2254 Reviewed-by: liuzx <liuzx@pcl.ac.cn>tags/v1.22.6.1^2
| @@ -1150,6 +1150,17 @@ type LogFile struct { | |||||
| Name string | Name string | ||||
| } | } | ||||
| type GetTrainJobMetricStatisticResult struct { | |||||
| TrainJobResult | |||||
| Interval int `json:"interval"` //查询的时间间隔,单位为分钟 | |||||
| MetricsInfo []Metrics `json:"metrics"` //监控详情 | |||||
| } | |||||
| type Metrics struct { | |||||
| Metric string `json:"metric"` //监控指标项 | |||||
| Value []string `json:"value"` //获取的监控值的序列,元素为String类型 | |||||
| } | |||||
| func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | ||||
| sess := x.NewSession() | sess := x.NewSession() | ||||
| defer sess.Close() | defer sess.Close() | ||||
| @@ -1119,3 +1119,44 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.GetTrainJobMetricStatisticResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/pod/" + podName + "/metric-statistic") | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| if res.StatusCode() != http.StatusOK { | |||||
| var temp models.ErrorResult | |||||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| } | |||||
| log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| } | |||||
| if !result.IsSuccess { | |||||
| log.Error("GetTrainJobMetricStatistic(%s) failed", jobID) | |||||
| return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| @@ -922,6 +922,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||||
| m.Post("/del_version", repo.DelTrainJobVersion) | m.Post("/del_version", repo.DelTrainJobVersion) | ||||
| m.Post("/stop_version", repo.StopTrainJobVersion) | m.Post("/stop_version", repo.StopTrainJobVersion) | ||||
| m.Get("/model_list", repo.ModelList) | m.Get("/model_list", repo.ModelList) | ||||
| m.Get("/metric_statistics", repo.TrainJobGetMetricStatistic) | |||||
| }) | }) | ||||
| }) | }) | ||||
| m.Group("/inference-job", func() { | m.Group("/inference-job", func() { | ||||
| @@ -462,3 +462,46 @@ func ResultList(ctx *context.APIContext) { | |||||
| "PageIsCloudBrain": true, | "PageIsCloudBrain": true, | ||||
| }) | }) | ||||
| } | } | ||||
| func TrainJobGetMetricStatistic(ctx *context.APIContext) { | |||||
| var ( | |||||
| err error | |||||
| ) | |||||
| var jobID = ctx.Params(":jobid") | |||||
| var versionName = ctx.Query("version_name") | |||||
| result, err := trainJobGetMetricStatistic(jobID, versionName) | |||||
| if err != nil { | |||||
| log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error()) | |||||
| return | |||||
| } | |||||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||||
| "JobID": jobID, | |||||
| "Interval": result.Interval, | |||||
| "MetricsInfo": result.MetricsInfo, | |||||
| }) | |||||
| } | |||||
| func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) { | |||||
| task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) | |||||
| if err != nil { | |||||
| log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) | |||||
| return nil, err | |||||
| } | |||||
| resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) | |||||
| if err != nil { | |||||
| log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) | |||||
| return nil, err | |||||
| } | |||||
| result, err := modelarts.GetTrainJobMetricStatistic(jobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0]) | |||||
| if err != nil { | |||||
| log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error()) | |||||
| return nil, err | |||||
| } | |||||
| return result, err | |||||
| } | |||||
| @@ -305,6 +305,7 @@ | |||||
| data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a> | data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a> | ||||
| <a class="item log_bottom" data-tab="second{{$k}}" | <a class="item log_bottom" data-tab="second{{$k}}" | ||||
| data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a> | data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a> | ||||
| <a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}">资源占用情况</a> | |||||
| <a class="item" data-tab="third{{$k}}" | <a class="item" data-tab="third{{$k}}" | ||||
| onclick="loadModelFile({{.VersionName}},'','','init')">{{$.i18n.Tr "repo.model_download"}}</a> | onclick="loadModelFile({{.VersionName}},'','','init')">{{$.i18n.Tr "repo.model_download"}}</a> | ||||
| </div> | </div> | ||||
| @@ -501,6 +502,13 @@ | |||||
| </div> | </div> | ||||
| </div> | </div> | ||||
| <div class="ui tab" data-tab="four{{$k}}" style="position: relative;"> | |||||
| <i class="ri-refresh-line metric_chart" | |||||
| style="position: absolute;right: 25%;color:#3291f8;z-index:99;cursor: pointer;" | |||||
| data-version="{{.VersionName}}"></i> | |||||
| <div id="metric-{{.VersionName}}" style="height: 260px;width: 870px;"> | |||||
| </div> | |||||
| </div> | |||||
| <div class="ui tab" data-tab="third{{$k}}"> | <div class="ui tab" data-tab="third{{$k}}"> | ||||
| <input type="hidden" name="model{{.VersionName}}" value="-1"> | <input type="hidden" name="model{{.VersionName}}" value="-1"> | ||||
| <input type="hidden" name="modelback{{.VersionName}}" value="-1"> | <input type="hidden" name="modelback{{.VersionName}}" value="-1"> | ||||
| @@ -5070,3 +5070,100 @@ function initcreateRepo() { | |||||
| } | } | ||||
| initcreateRepo() | initcreateRepo() | ||||
/**
 * Binds every `.metric_chart` element (the resource-usage tab and its refresh
 * icon) to an ECharts line chart rendered into `#metric-<version>`.
 *
 * On click the handler requests the `metric_statistics` endpoint for the
 * clicked version and plots one line per returned metric, except the four
 * byte/disk rate metrics listed in `hiddenMetrics`.
 */
function initChartsNpu() {
  // User, repository and job id sit at fixed offsets from the end of the
  // page path. `pathname` is used so a query string or hash cannot leak into
  // the last segment.
  const pathSegments = window.location.pathname.split('/')
  const userName = pathSegments.slice(-5)[0]
  const repoPath = pathSegments.slice(-4)[0]
  const jobID = pathSegments.slice(-1)[0]

  // Metrics left out of the chart — presumably because they are throughput
  // rates and do not fit the 0-100 percentage axis (TODO confirm).
  const hiddenMetrics = ['recvBytesRate', 'diskWriteRate', 'sendBytesRate', 'diskReadRate']

  // Builds a fresh option object per draw so charts of different versions
  // never share (and overwrite) each other's legend, axis or series data.
  const buildOptions = (legendData, xAxisData, seriesData) => ({
    legend: {
      data: legendData
    },
    grid: {
      top: '30%',
      bottom: '2%',
      containLabel: true
    },
    tooltip: {
      trigger: 'axis',
      backgroundColor: 'rgb(51, 56, 84)',
      borderColor: 'rgb(51, 51, 51)',
      borderWidth: 0,
      textStyle: {
        color: '#fff'
      },
      axisPointer: {
        type: 'line'
      }
    },
    xAxis: {
      type: 'category',
      data: xAxisData,
      boundaryGap: false,
      axisLabel: {
        interval: 'auto'
      },
      name: '时间(min)'
    },
    yAxis: {
      min: 0,
      max: 100,
      show: true,
      name: '占有率(%)',
      axisLine: {
        show: true
      },
      axisTick: { show: true }
    },
    series: seriesData
  })

  $('.metric_chart').click(function () {
    const versionName = $(this).data('version')
    const container = document.getElementById(`metric-${versionName}`)
    if (!container) {
      return
    }

    // Reuse the chart already attached to this container; calling
    // echarts.init again on the same DOM node creates a duplicate instance.
    let chart = echarts.getInstanceByDom(container)
    if (!chart) {
      chart = echarts.init(container)
      // Show empty axes while the first request is in flight.
      chart.setOption(buildOptions([], [], []))
    }

    const requestUrl = `${window.config.AppSubUrl}/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/metric_statistics?version_name=${encodeURIComponent(versionName)}&statistic_type=each`

    $.get(requestUrl, (res) => {
      // The payload may be empty or lack MetricsInfo (e.g. no data yet).
      const metrics = res && Array.isArray(res.MetricsInfo) ? res.MetricsInfo : []
      const visibleMetrics = metrics.filter((item) => !hiddenMetrics.includes(item.metric))

      const legendData = visibleMetrics.map((item) => item.metric)
      const seriesData = visibleMetrics.map((item) => ({
        name: item.metric,
        type: 'line',
        symbol: 'circle',
        symbolSize: 10,
        smooth: true,
        showSymbol: false,
        lineStyle: {
          width: 2,
          shadowColor: 'rgba(0,0,0,0.3)',
          shadowBlur: 10,
          shadowOffsetY: 8
        },
        data: item.value || []
      }))

      // The x axis is a 1-based sample index covering the longest series.
      const xLength = visibleMetrics.reduce(
        (longest, item) => Math.max(longest, (item.value || []).length),
        0
      )
      const xAxisData = Array.from({ length: xLength }, (_, index) => index + 1)

      // notMerge = true drops series left over from a previous load.
      chart.setOption(buildOptions(legendData, xAxisData, seriesData), true)
    }).fail((xhr) => {
      console.error(`metric_statistics request failed for version ${versionName}:`, xhr.status)
    })
  })
}

initChartsNpu()