Browse Source

fix issue

tags/v1.22.11.2^2
zhoupzh 3 years ago
parent
commit
5713e2936d
14 changed files with 119 additions and 35 deletions
  1. +7
    -0
      models/cloudbrain.go
  2. +17
    -1
      models/resource_specification.go
  3. +26
    -0
      modules/grampus/resty.go
  4. +1
    -0
      routers/admin/resources.go
  5. +1
    -0
      routers/api/v1/api.go
  6. +8
    -2
      routers/api/v1/repo/cloudbrain.go
  7. +13
    -10
      routers/api/v1/repo/modelarts.go
  8. +28
    -7
      routers/repo/grampus.go
  9. +1
    -1
      services/cloudbrain/cloudbrainTask/sync_status.go
  10. +1
    -0
      services/cloudbrain/resource/resource_specification.go
  11. +2
    -1
      templates/repo/cloudbrain/trainjob/show.tmpl
  12. +11
    -5
      templates/repo/grampus/trainjob/show.tmpl
  13. +1
    -1
      templates/repo/modelarts/trainjob/show.tmpl
  14. +2
    -7
      web_src/js/index.js

+ 7
- 0
models/cloudbrain.go View File

@@ -291,6 +291,13 @@ func (task *Cloudbrain) IsRunning() bool {
status == string(JobRunning) || status == GrampusStatusRunning
}

// IsUserHasRight reports whether user has rights on this task.
// It is false for a nil user; otherwise it is true when user.IsAdmin is set
// or when user.ID equals the task's UserID.
func (task *Cloudbrain) IsUserHasRight(user *User) bool {
	return user != nil && (user.IsAdmin || user.ID == task.UserID)
}

func ConvertDurationToStr(duration int64) string {
if duration <= 0 {
return DURATION_STR_ZERO


+ 17
- 1
models/resource_specification.go View File

@@ -12,6 +12,13 @@ const (
SpecOffShelf
)

// SearchSpecOrderBy selects the ordering that SearchResourceSpecification
// applies to its result list.
type SearchSpecOrderBy int

const (
	// SearchSpecOrderById is the zero value. SearchResourceSpecification
	// handles it in its default branch and orders by
	// resource_specification.id descending.
	SearchSpecOrderById SearchSpecOrderBy = 0
	// SearchSpecOrder4Standard orders by the queue's compute resource and
	// accelerator card type, then by the specification's card count, CPU
	// cores, memory and shared memory, all ascending.
	SearchSpecOrder4Standard SearchSpecOrderBy = 1
)

type ResourceSpecification struct {
ID int64 `xorm:"pk autoincr"`
QueueId int64 `xorm:"INDEX"`
@@ -85,6 +92,7 @@ type SearchResourceSpecificationOptions struct {
Status int
Cluster string
AvailableCode int
OrderBy SearchSpecOrderBy
}

type SearchResourceBriefSpecificationOptions struct {
@@ -233,10 +241,18 @@ func SearchResourceSpecification(opts SearchResourceSpecificationOptions) (int64
return 0, nil, err
}

var orderby = ""
switch opts.OrderBy {
case SearchSpecOrder4Standard:
orderby = "resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc,resource_specification.cpu_cores asc,resource_specification.mem_gi_b asc,resource_specification.share_mem_gi_b asc"
default:
orderby = "resource_specification.id desc"
}

r := make([]ResourceSpecAndQueue, 0)
err = x.Where(cond).
Join("INNER", "resource_queue", "resource_queue.ID = resource_specification.queue_id").
Desc("resource_specification.id").
OrderBy(orderby).
Limit(opts.PageSize, (opts.Page-1)*opts.PageSize).
Unscoped().Find(&r)
if err != nil {


+ 26
- 0
modules/grampus/resty.go View File

@@ -245,6 +245,32 @@ func GetTrainJobLog(jobID string) (string, error) {
return logContent, nil
}

// GetGrampusMetrics requests the metrics of task 0 / replica 0 of the Grampus
// train job identified by jobID and decodes the JSON response.
//
// An error is returned when the request fails, the body is not valid JSON,
// the HTTP status is not 200, or the decoded result has IsSuccess == false.
// The result value is returned alongside the error in every case, so it holds
// whatever was decoded up to the point of failure.
func GetGrampusMetrics(jobID string) (models.GetTrainJobMetricStatisticResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobMetricStatisticResult
	res, err := client.R().
		SetAuthToken(TOKEN).
		Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics")
	if err != nil {
		return result, fmt.Errorf("resty GetGrampusMetrics: %v", err)
	}

	// Read the body once; it is needed for decoding and for both messages below.
	body := res.String()
	if err = json.Unmarshal([]byte(body), &result); err != nil {
		log.Error("GetGrampusMetrics json.Unmarshal failed(%s): %v", body, err)
		return result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err)
	}
	if res.StatusCode() != http.StatusOK {
		// %v is used for ErrorCode in both places so the log line and the
		// returned error render it the same way regardless of its type.
		log.Error("Call GrampusMetrics failed(%d):%v(%s)", res.StatusCode(), result.ErrorCode, result.ErrorMsg)
		return result, fmt.Errorf("Call GrampusMetrics failed(%d):%v(%s)", res.StatusCode(), result.ErrorCode, result.ErrorMsg)
	}
	if !result.IsSuccess {
		log.Error("GetGrampusMetrics(%s) failed", jobID)
		return result, fmt.Errorf("GetGrampusMetrics failed:%s", result.ErrorMsg)
	}
	return result, nil
}

func StopJob(jobID string) (*models.GrampusStopJobResponse, error) {
checkSetting()
client := getRestyClient()


+ 1
- 0
routers/admin/resources.go View File

@@ -127,6 +127,7 @@ func GetResourceSpecificationList(ctx *context.Context) {
Status: status,
Cluster: cluster,
AvailableCode: available,
OrderBy: models.SearchSpecOrderById,
})
if err != nil {
log.Error("GetResourceSpecificationList error.%v", err)


+ 1
- 0
routers/api/v1/api.go View File

@@ -1048,6 +1048,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("", repo.GetModelArtsTrainJobVersion)
m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob)
m.Get("/log", repo_ext.GrampusGetLog)
m.Get("/metrics", repo_ext.GrampusMetrics)
m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
})
})


+ 8
- 2
routers/api/v1/repo/cloudbrain.go View File

@@ -661,14 +661,20 @@ func CloudbrainGetLog(ctx *context.APIContext) {
if ctx.Data["existStr"] != nil && result["Lines"].(int) < 50 {
content = content + ctx.Data["existStr"].(string)
}
logFileName := result["FileName"]

//Logs can only be downloaded if the file exists
//and the current user is an administrator or the creator of the task
canLogDownload := logFileName != nil && logFileName != "" && job.IsUserHasRight(ctx.User)

re := map[string]interface{}{
"JobID": ID,
"LogFileName": result["FileName"],
"LogFileName": logFileName,
"StartLine": result["StartLine"],
"EndLine": result["EndLine"],
"Content": content,
"Lines": result["Lines"],
"CanLogDownload": result["FileName"] != "",
"CanLogDownload": canLogDownload,
"StartTime": job.StartTime,
}
//result := CloudbrainGetLogByJobId(job.JobID, job.JobName)


+ 13
- 10
routers/api/v1/repo/modelarts.go View File

@@ -281,15 +281,6 @@ func TrainJobGetLog(ctx *context.APIContext) {
return
}

prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
_, err = storage.GetObsLogFileName(prefix)
var canLogDownload bool
if err != nil {
canLogDownload = false
} else {
canLogDownload = true
}

ctx.Data["log_file_name"] = resultLogFile.LogFileList[0]

ctx.JSON(http.StatusOK, map[string]interface{}{
@@ -299,11 +290,23 @@ func TrainJobGetLog(ctx *context.APIContext) {
"EndLine": result.EndLine,
"Content": result.Content,
"Lines": result.Lines,
"CanLogDownload": canLogDownload,
"CanLogDownload": canLogDownload(ctx.User, task),
"StartTime": task.StartTime,
})
}

// canLogDownload reports whether user may download the log of task.
// It requires a non-nil task, task.IsUserHasRight(user) to hold, and
// storage.GetObsLogFileName to succeed for the task's log prefix.
func canLogDownload(user *models.User, task *models.Cloudbrain) bool {
	if task == nil {
		return false
	}
	if !task.IsUserHasRight(user) {
		return false
	}

	// Prefix is "<model path>/<job name>/<log path>/<version>/job" with any
	// leading slash removed.
	logDir := path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, task.VersionName)
	prefix := strings.TrimPrefix(logDir, "/") + "/job"

	_, err := storage.GetObsLogFileName(prefix)
	return err == nil
}

func trainJobGetLogContent(jobID string, versionID int64, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {

resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(versionID, 10))


+ 28
- 7
routers/repo/grampus.go View File

@@ -940,15 +940,14 @@ func GrampusGetLog(ctx *context.Context) {
content, err := grampus.GetTrainJobLog(job.JobID)
if err != nil {
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
ctx.ServerError(err.Error(), err)
ctx.JSON(http.StatusOK, map[string]interface{}{
"JobName": job.JobName,
"Content": "",
"CanLogDownload": false,
})
return
}
var canLogDownload bool
if err != nil {
canLogDownload = false
} else {
canLogDownload = true
}
canLogDownload := err == nil && job.IsUserHasRight(ctx.User)
ctx.JSON(http.StatusOK, map[string]interface{}{
"JobName": job.JobName,
"Content": content,
@@ -958,6 +957,28 @@ func GrampusGetLog(ctx *context.Context) {
return
}

// GrampusMetrics writes the metrics of the Grampus train job named by the
// ":jobid" route parameter as JSON.
//
// A failed job lookup is reported through ctx.ServerError. A failed metrics
// fetch is only logged: the handler still answers 200 with the Interval and
// MetricsInfo fields of whatever result GetGrampusMetrics returned.
func GrampusMetrics(ctx *context.Context) {
	jobID := ctx.Params(":jobid")
	job, err := models.GetCloudbrainByJobID(jobID)
	if err != nil {
		log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
		ctx.ServerError(err.Error(), err)
		return
	}

	result, err := grampus.GetGrampusMetrics(job.JobID)
	if err != nil {
		// Deliberately not fatal: fall through and send the result as returned.
		log.Error("GetGrampusMetrics failed: %v", err, ctx.Data["MsgID"])
	}
	ctx.JSON(http.StatusOK, map[string]interface{}{
		"JobID":       jobID,
		"Interval":    result.Interval,
		"MetricsInfo": result.MetricsInfo,
	})
}

func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) {
var command string



+ 1
- 1
services/cloudbrain/cloudbrainTask/sync_status.go View File

@@ -14,7 +14,7 @@ import (
var noteBookOKMap = make(map[int64]int, 20)

// If a task's notebook URL can be fetched successfulCount times, the notebook is treated as ready to open in the browser.
const successfulCount = 2
const successfulCount = 3

func SyncCloudBrainOneStatus(task *models.Cloudbrain) (*models.Cloudbrain, error) {
jobResult, err := cloudbrain.GetJob(task.JobID)


+ 1
- 0
services/cloudbrain/resource/resource_specification.go View File

@@ -138,6 +138,7 @@ func GetResourceSpecificationList(opts models.SearchResourceSpecificationOptions
func GetAllDistinctResourceSpecification(opts models.SearchResourceSpecificationOptions) (*models.ResourceSpecAndQueueListRes, error) {
opts.Page = 0
opts.PageSize = 1000
opts.OrderBy = models.SearchSpecOrder4Standard
_, r, err := models.SearchResourceSpecification(opts)
if err != nil {
return nil, err


+ 2
- 1
templates/repo/cloudbrain/trainjob/show.tmpl View File

@@ -284,7 +284,8 @@
<div class="content-pad">
<div class="ui pointing secondary menu" style="border-bottom: 1px solid rgba(34,36,38,.15);">
<a class="active item"
data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>

<a class="item log_bottom" data-tab="third{{$k}}"
data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
<a class="item load-model-file" data-tab="four{{$k}}" data-gpu-flag="true" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/cloudbrain/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a>


+ 11
- 5
templates/repo/grampus/trainjob/show.tmpl View File

@@ -238,11 +238,8 @@
<span>
<div style="float: right;">
{{$.CsrfTokenHtml}}
</div>
<div class="ac-display-inblock title_text acc-margin-bottom">

<span class="cti-mgRight-sm">{{TimeSinceUnix1 .CreatedUnix}}</span>
<span class="cti-mgRight-sm">
{{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}}</span>
@@ -260,7 +257,6 @@
<span class="refresh-status" data-tooltip="刷新" style="cursor: pointer;" data-inverted="" data-version="{{.VersionName}}">
<i class="redo icon redo-color"></i>
</span>

</div>
<div style="float: right;">
{{if and ($.canDownload) (ne .Status "WAITING") ($.Permission.CanWrite $.UnitTypeModelManage) }}
@@ -269,7 +265,6 @@
{{else}}
<a class="ti-action-menu-item disabled" id="{{.VersionName}}-create-model">{{$.i18n.Tr "repo.modelarts.create_model"}}</a>
{{end}}

</div>
</span>
</span>
@@ -282,6 +277,9 @@

<a class="active item" data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
<a class="item log_bottom" data-tab="second{{$k}}" data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
{{ if eq $.Spec.ComputeResource "NPU"}}
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}" data-path="{{$.RepoRelPath}}/grampus/train-job/{{.JobID}}/metrics">{{$.i18n.Tr "cloudbrain.resource_use"}}</a>
{{end}}
<a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a>
</div>
<div class="ui tab active" data-tab="first{{$k}}">
@@ -564,6 +562,14 @@

</div>

</div>
<div class="ui tab" data-tab="four{{$k}}" style="position: relative;">
<i class="ri-refresh-line metric_chart"
style="position: absolute;right: 25%;color:#3291f8;z-index:99;cursor: pointer;"
data-version="{{.VersionName}}"></i>
<div id="metric-{{.VersionName}}" style="height: 260px;width: 870px;">
</div>
</div>
<div class="ui tab" data-tab="third{{$k}}">
<input type="hidden" name="model{{.VersionName}}" value="-1">


+ 1
- 1
templates/repo/modelarts/trainjob/show.tmpl View File

@@ -321,7 +321,7 @@
data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
<a class="item log_bottom" data-tab="second{{$k}}"
data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}">{{$.i18n.Tr "cloudbrain.resource_use"}}</a>
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}" data-path="{{$.RepoRelPath}}/modelarts/train-job/{{.JobID}}/metric_statistics?version_name={{.VersionName}}&statistic_type=each&metrics=">{{$.i18n.Tr "cloudbrain.resource_use"}}</a>
<a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a>
</div>
<div class="ui tab active" data-tab="first{{$k}}">


+ 2
- 7
web_src/js/index.js View File

@@ -5071,12 +5071,7 @@ function initcreateRepo() {
initcreateRepo();

function initChartsNpu() {
const url = window.location.href;
const urlArr = url.split("/");
let userName = urlArr.slice(-5)[0];
let repoPath = urlArr.slice(-4)[0];
let jobID = urlArr.slice(-1)[0];

const repoPath = $('.metric_chart').data('path')
let options = {
legend: {
data: [],
@@ -5127,7 +5122,7 @@ function initChartsNpu() {
document.getElementById(`metric-${versionName}`)
);
$.get(
`${window.config.AppSubUrl}/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/metric_statistics?version_name=${versionName}&statistic_type=each&metrics=`,
`${window.config.AppSubUrl}/api/v1/repos/${repoPath}`,
(res) => {
let filterDta = res.MetricsInfo.filter((item) => {
return ![


Loading…
Cancel
Save