Browse Source

Merge pull request '增加NPU训练任务资源占用情况的查看' (#2254) from show-npu-metric into V20220616

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/2254
Reviewed-by: liuzx <liuzx@pcl.ac.cn>
tags/v1.22.6.1^2
liuzx 3 years ago
parent
commit
b9be140ac3
6 changed files with 201 additions and 0 deletions
  1. +11
    -0
      models/cloudbrain.go
  2. +41
    -0
      modules/modelarts/resty.go
  3. +1
    -0
      routers/api/v1/api.go
  4. +43
    -0
      routers/api/v1/repo/modelarts.go
  5. +8
    -0
      templates/repo/modelarts/trainjob/show.tmpl
  6. +97
    -0
      web_src/js/index.js

+ 11
- 0
models/cloudbrain.go View File

@@ -1150,6 +1150,17 @@ type LogFile struct {
Name string
}

type GetTrainJobMetricStatisticResult struct {
TrainJobResult
Interval int `json:"interval"` //查询的时间间隔,单位为分钟
MetricsInfo []Metrics `json:"metrics"` //监控详情
}

type Metrics struct {
Metric string `json:"metric"` //监控指标项
Value []string `json:"value"` //获取的监控值的序列,元素为String类型
}

func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
sess := x.NewSession()
defer sess.Close()


+ 41
- 0
modules/modelarts/resty.go View File

@@ -1119,3 +1119,44 @@ sendjob:

return &result, nil
}

func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetTrainJobMetricStatisticResult

retry := 0

sendjob:
res, err := client.R().
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/pod/" + podName + "/metric-statistic")

if err != nil {
return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err)
}

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
goto sendjob
}

if res.StatusCode() != http.StatusOK {
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
}

if !result.IsSuccess {
log.Error("GetTrainJobMetricStatistic(%s) failed", jobID)
return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg)
}

return &result, nil
}

+ 1
- 0
routers/api/v1/api.go View File

@@ -922,6 +922,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/del_version", repo.DelTrainJobVersion)
m.Post("/stop_version", repo.StopTrainJobVersion)
m.Get("/model_list", repo.ModelList)
m.Get("/metric_statistics", repo.TrainJobGetMetricStatistic)
})
})
m.Group("/inference-job", func() {


+ 43
- 0
routers/api/v1/repo/modelarts.go View File

@@ -462,3 +462,46 @@ func ResultList(ctx *context.APIContext) {
"PageIsCloudBrain": true,
})
}

func TrainJobGetMetricStatistic(ctx *context.APIContext) {
var (
err error
)

var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")

result, err := trainJobGetMetricStatistic(jobID, versionName)
if err != nil {
log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error())
return
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"Interval": result.Interval,
"MetricsInfo": result.MetricsInfo,
})
}

func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) {
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
return nil, err
}

resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
return nil, err
}

result, err := modelarts.GetTrainJobMetricStatistic(jobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0])
if err != nil {
log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error())
return nil, err
}

return result, err
}

+ 8
- 0
templates/repo/modelarts/trainjob/show.tmpl View File

@@ -305,6 +305,7 @@
data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
<a class="item log_bottom" data-tab="second{{$k}}"
data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}">资源占用情况</a>
<a class="item" data-tab="third{{$k}}"
onclick="loadModelFile({{.VersionName}},'','','init')">{{$.i18n.Tr "repo.model_download"}}</a>
</div>
@@ -501,6 +502,13 @@
</div>

</div>
<div class="ui tab" data-tab="four{{$k}}" style="position: relative;">
<i class="ri-refresh-line metric_chart"
style="position: absolute;right: 25%;color:#3291f8;z-index:99;cursor: pointer;"
data-version="{{.VersionName}}"></i>
<div id="metric-{{.VersionName}}" style="height: 260px;width: 870px;">
</div>
</div>
<div class="ui tab" data-tab="third{{$k}}">
<input type="hidden" name="model{{.VersionName}}" value="-1">
<input type="hidden" name="modelback{{.VersionName}}" value="-1">


+ 97
- 0
web_src/js/index.js View File

@@ -5070,3 +5070,100 @@ function initcreateRepo() {
}

initcreateRepo()


function initChartsNpu() {
const url = window.location.href
const urlArr = url.split('/')
let userName = urlArr.slice(-5)[0]
let repoPath = urlArr.slice(-4)[0]
let jobID = urlArr.slice(-1)[0]


let options = {
legend: {
data: []
},
grid: {
top: '30%',
bottom: '2%',
containLabel: true
},
tooltip: {
trigger: 'axis',
backgroundColor: 'rgb(51, 56, 84)',
borderColor: 'rgb(51, 51, 51)',
borderWidth: 0,
textStyle: {
color: '#fff'
},
axisPointer: {
type: 'line'
}
},
xAxis: {
type: 'category',
data: [],
boundaryGap: false,
axisLabel: {
interval: 'auto'
},
name: '时间(min)'
},
yAxis: {
min: 0,
max: 100,
show: true,
name: '占有率(%)',
axisLine: {
show: true
},
axisTick: { show: true }
},

series: []
};
$('.metric_chart').click(function (e) {
let versionName = $(this).data('version')
console.log("11111", versionName)
let myCharts = echarts.init(document.getElementById(`metric-${versionName}`))
$.get(`${window.config.AppSubUrl}/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/metric_statistics?version_name=${versionName}&statistic_type=each`, (res) => {
let filterDta = res.MetricsInfo.filter((item) => {

return !(['recvBytesRate', 'diskWriteRate', 'sendBytesRate', 'diskReadRate'].includes(item.metric))
})
let legenData = filterDta.map((item) => {
return item.metric
})
let seriesData = filterDta.map((item) => {
let seriesOption = {
name: item.metric,
type: 'line',
symbol: 'circle',
symbolSize: 10,
smooth: true,
showSymbol: false,
lineStyle: {
width: 2,
shadowColor: 'rgba(0,0,0,0.3)',
shadowBlur: 10,
shadowOffsetY: 8
},
data: item.value
}
return seriesOption
})
let xLength = res.MetricsInfo[0].value.length
console.log(legenData)
options.xAxis.data = Array.from({ length: xLength }, (_, index) => index + 1)
options.legend.data = legenData
options.series = seriesData
options && myCharts.setOption(options);

})
options && myCharts.setOption(options);

})
}

initChartsNpu()

Loading…
Cancel
Save