Browse Source

Merge pull request 'NPU训练任务增加日志下载功能' (#2253) from download-npu-log into V20220616

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/2253
Reviewed-by: liuzx <liuzx@pcl.ac.cn>
tags/v1.22.6.1^2
liuzx 3 years ago
parent
commit
b3b47fb286
6 changed files with 69 additions and 5 deletions
  1. +14
    -0
      modules/storage/obs.go
  2. +1
    -0
      options/locale/locale_en-US.ini
  3. +1
    -0
      options/locale/locale_zh-CN.ini
  4. +34
    -4
      routers/repo/modelarts.go
  5. +1
    -0
      routers/routes/routes.go
  6. +18
    -1
      templates/repo/modelarts/trainjob/show.tmpl

+ 14
- 0
modules/storage/obs.go View File

@@ -564,3 +564,17 @@ func ObsCreateObject(path string) error {

return nil
}

// obsLogNotFoundError is returned by GetObsLogFileName when the bucket
// listing for the requested prefix contains no objects. Its underlying
// string value is the prefix that was searched.
type obsLogNotFoundError string

// Error implements the error interface.
func (e obsLogNotFoundError) Error() string {
	return "no object found in obs with prefix: " + string(e)
}

// GetObsLogFileName lists the objects in setting.Bucket whose keys start with
// prefix and returns the key of the first entry in the listing.
//
// It returns the ListObjects error unchanged when the listing call fails, and
// an obsLogNotFoundError when the listing succeeds but is empty.
func GetObsLogFileName(prefix string) (string, error) {
	input := &obs.ListObjectsInput{}
	input.Bucket = setting.Bucket
	input.Prefix = prefix

	output, err := ObsCli.ListObjects(input)
	if err != nil {
		log.Error("ListObjects(%s) failed: %v", prefix, err)
		return "", err
	}

	// Guard the index below: a prefix that matches nothing yields an empty
	// Contents slice, and indexing it would panic the request handler.
	if len(output.Contents) == 0 {
		return "", obsLogNotFoundError(prefix)
	}

	return output.Contents[0].Key, nil
}

+ 1
- 0
options/locale/locale_en-US.ini View File

@@ -1145,6 +1145,7 @@ modelarts.infer_job.model_version = Model/Version
modelarts.infer_job.select_model = Select Model
modelarts.infer_job.boot_file_helper=The startup file is the entry file for your program execution and must end in .py, such as inference.py, main.py, example/inference.py, case/main.py.
modelarts.infer_job.tooltip = The model has been deleted and cannot be viewed.
modelarts.download_log=Download log file


debug_task_not_created = Debug task has not been created


+ 1
- 0
options/locale/locale_zh-CN.ini View File

@@ -1155,6 +1155,7 @@ modelarts.infer_job.model_version = 模型/版本
modelarts.infer_job.select_model = 选择模型
modelarts.infer_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。比如inference.py、main.py、example/inference.py、case/main.py。
modelarts.infer_job.tooltip = 该模型已删除,无法查看。
modelarts.download_log=下载日志文件


debug_task_not_created = 未创建过调试任务


+ 34
- 4
routers/repo/modelarts.go View File

@@ -2251,7 +2251,6 @@ func ModelDownload(ctx *context.Context) {
versionName := ctx.Query("version_name")
parentDir := ctx.Query("parent_dir")
fileName := ctx.Query("file_name")
log.Info("DownloadSingleModelFile start.")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
@@ -2259,7 +2258,6 @@ func ModelDownload(ctx *context.Context) {
}

path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir, fileName), "/")
log.Info("Download path is:%s", path)

url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
if err != nil {
@@ -2267,6 +2265,7 @@ func ModelDownload(ctx *context.Context) {
ctx.ServerError("GetObsCreateSignedUrl", err)
return
}
ctx.Resp.Header().Set("Cache-Control", "max-age=0")
http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
}

@@ -2278,13 +2277,11 @@ func ResultDownload(ctx *context.Context) {
versionName := ctx.Query("version_name")
parentDir := ctx.Query("parent_dir")
fileName := ctx.Query("file_name")
log.Info("DownloadResult start.")
task := ctx.Cloudbrain
if err != nil {
ctx.Data["error"] = err.Error()
}
path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName, parentDir, fileName), "/")
log.Info("Download path is:%s", path)

url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
if err != nil {
@@ -2292,6 +2289,7 @@ func ResultDownload(ctx *context.Context) {
ctx.ServerError("GetObsCreateSignedUrl", err)
return
}
ctx.Resp.Header().Set("Cache-Control", "max-age=0")
http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
}
func DeleteJobStorage(jobName string) error {
@@ -2390,3 +2388,35 @@ func SetJobCount(ctx *context.Context) {
}
ctx.Data["jobCount"] = jobCount
}

// TrainJobDownloadLogFile redirects the client to a signed OBS URL for the log
// file of a train job version.
//
// The job is identified by the ":jobid" route parameter and the "version_name"
// query parameter. The log object is looked up under the prefix
// <TrainJobModelPath>/<JobName>/<LogPath>/<versionName>/job and the first key
// returned by storage.GetObsLogFileName is used. Any failure is logged and
// reported through ctx.ServerError.
func TrainJobDownloadLogFile(ctx *context.Context) {
	jobID := ctx.Params(":jobid")
	versionName := ctx.Query("version_name")

	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
	if err != nil {
		// Log the request identifiers rather than task.JobName: the task
		// returned alongside an error must not be dereferenced.
		log.Error("GetCloudbrainByJobIDAndVersionName(%s, %s) failed: %v, msgID: %v", jobID, versionName, err, ctx.Data["msgID"])
		ctx.ServerError("GetCloudbrainByJobIDAndVersionName", err)
		return
	}

	// Object keys carry no leading slash, hence the TrimPrefix on the joined path.
	prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
	key, err := storage.GetObsLogFileName(prefix)
	if err != nil {
		log.Error("GetObsLogFileName(%s) failed: %v, msgID: %v", jobID, err, ctx.Data["msgID"])
		ctx.ServerError("GetObsLogFileName", err)
		return
	}

	url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, key)
	if err != nil {
		log.Error("GetObsCreateSignedUrlByBucketAndKey failed: %v, msgID: %v", err, ctx.Data["msgID"])
		ctx.ServerError("GetObsCreateSignedUrlByBucketAndKey", err)
		return
	}

	// The redirect status matches the other download handlers in this file;
	// max-age=0 asks clients not to reuse the cached redirect target.
	ctx.Resp.Header().Set("Cache-Control", "max-age=0")
	http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
}

+ 1
- 0
routers/routes/routes.go View File

@@ -1136,6 +1136,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobStop)
m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobDel)
m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload)
m.Get("/download_log_file", cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobDownloadLogFile)
m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobNewVersion)
m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion)
})


+ 18
- 1
templates/repo/modelarts/trainjob/show.tmpl View File

@@ -177,6 +177,12 @@
border: 1px solid #dfe1e6;
}

.ti-download-file {
display: flex;
align-items: center;
margin: 0.5rem 0;
}

.disabled {
cursor: default;
pointer-events: none;
@@ -220,6 +226,7 @@
<div class="active section">{{.displayJobName}}</div>
</div>
</h4>

{{range $k ,$v := .version_list_task}}
<div class="ui accordion border-according" id="accordion{{.VersionName}}"
data-repopath="{{$.RepoRelPath}}/modelarts/train-job" data-jobid="{{.JobID}}"
@@ -479,7 +486,17 @@
</div>
</div>
<div class="ui tab" data-tab="second{{$k}}">
<div style="position: relative;">
<div>

<a class='{{if eq .Status "KILLED" "FAILED" "START_FAILED" "STOPPED" "COMPLETED"}}ti-download-file{{else}}disabled{{end}}'
href="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/download_log_file?version_name={{.VersionName}}">
<i class="ri-download-cloud-2-line"></i>
<span style="margin-left: 0.3rem;">{{$.i18n.Tr "repo.modelarts.download_log"}}</span>
</a>

</div>
<div
style="position: relative;border: 1px solid rgba(0,0,0,.2);padding: 0 10px;margin-top: 10px;">
<span>
<a title="滚动到顶部" style="position: absolute; right: -32px;cursor: pointer;"
class="log_top" data-version="{{.VersionName}}"><i class="icon-to-top"></i></a>


Loading…
Cancel
Save