Browse Source

Merge pull request 'fix-2251' (#3344) from fix-2251 into V20221214

Reviewed-on: https://openi.pcl.ac.cn/OpenI/aiforge/pulls/3344
Reviewed-by: zouap <zouap@pcl.ac.cn>
tags/v1.22.12.1^2
zouap 3 years ago
parent
commit
8278358049
7 changed files with 291 additions and 2 deletions
  1. +84
    -0
      models/cloudbrain.go
  2. +17
    -2
      modules/cron/tasks_basic.go
  3. +24
    -0
      modules/setting/setting.go
  4. +1
    -0
      options/locale/locale_en-US.ini
  5. +2
    -0
      options/locale/locale_zh-CN.ini
  6. +12
    -0
      routers/repo/cloudbrain.go
  7. +151
    -0
      services/cloudbrain/clear.go

+ 84
- 0
models/cloudbrain.go View File

@@ -204,6 +204,7 @@ type Cloudbrain struct {
BenchmarkTypeRankLink string `xorm:"-"`
StartTime timeutil.TimeStamp
EndTime timeutil.TimeStamp
Cleared bool `xorm:"DEFAULT false"`
Spec *Specification `xorm:"-"`
}

@@ -1905,6 +1906,12 @@ func GetCloudbrainByID(id string) (*Cloudbrain, error) {
return getRepoCloudBrain(cb)
}

func IsCloudbrainExistByJobName(jobName string)(bool,error){
return x.Unscoped().Exist(&Cloudbrain{
JobName: jobName,
})
}

func GetCloudbrainByIDWithDeleted(id string) (*Cloudbrain, error) {
idInt64, _ := strconv.ParseInt(id, 10, 64)
cb := &Cloudbrain{ID: idInt64}
@@ -2050,6 +2057,83 @@ func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) {
Find(&cloudbrains)
}

// GetCloudBrainOneStoppedNotDebugJobDaysAgo returns at most limit non-DEBUG
// records with type=0 that are in a finished status, are not yet marked as
// cleared, and finished more than days days ago.
//
// Only the id, job_name and job_id columns are loaded. Rows without an
// end_time fall back to updated_unix and must be one extra day older than the
// threshold.
func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
	cloudbrains := make([]*Cloudbrain, 0, 10)
	endTimeBefore := time.Now().Unix() - int64(days)*24*3600
	missEndTimeBefore := endTimeBefore - 24*3600

	// Run the query before building the return values: in
	// "return cloudbrains, q.Find(&cloudbrains)" the order between reading the
	// slice variable and calling Find is not specified by the language.
	err := x.Unscoped().Cols("id,job_name,job_id").
		In("status",
			JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted,
			ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed,
			ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed).
		Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0 and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore).
		Limit(limit).
		Find(&cloudbrains)
	return cloudbrains, err
}
// GetCloudBrainOneStoppedDebugJobDaysAgo returns at most limit DEBUG records
// with type=0 whose latest run is finished, is not yet marked as cleared, and
// ended more than days days ago.
//
// A debug job can be restarted several times under the same job_name, so the
// inner query keeps only the most recently updated row per job_name and the
// age check is applied to that last run. Rows without an end_time fall back to
// updated_unix and must be one extra day older than the threshold.
func GetCloudBrainOneStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
	cloudbrains := make([]*Cloudbrain, 0, 10)
	endTimeBefore := time.Now().Unix() - int64(days)*24*3600
	missEndTimeBefore := endTimeBefore - 24*3600

	// The limit is part of the statement itself: a Limit() chained after SQL()
	// is not applied to raw SQL, which left this query unbounded.
	sql := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name)
      id, job_name, job_id,status,end_time,updated_unix,cleared
     FROM cloudbrain
      where type=0 and job_type='DEBUG'
     ORDER BY job_name, updated_unix DESC) a
     where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false
     limit ?`

	// Query first, return afterwards, so the returned slice is always read
	// after Find has filled it.
	err := x.Unscoped().SQL(sql, missEndTimeBefore, endTimeBefore, limit).Find(&cloudbrains)
	return cloudbrains, err
}


func UpdateCloudBrainRecordsCleared(ids []int64) error {
pageSize := 150
n := len(ids) / pageSize

var err error

for i := 1; i <= n+1; i++ {
tempIds := getPageIds(ids, i, pageSize)
if len(tempIds) > 0 {
idsIn := ""
for i, id := range tempIds {
if i == 0 {
idsIn += strconv.FormatInt(id, 10)
} else {
idsIn += "," + strconv.FormatInt(id, 10)
}
}

_, errTemp := x.Unscoped().Exec("update cloudbrain set cleared=true where id in (" + idsIn + ")")
if errTemp != nil {
err = errTemp
}

}

}
return err

}

// getPageIds returns the page-th chunk (1-based) of ids, where every chunk
// holds at most pagesize elements. The result shares its backing array with
// ids. An empty, non-nil slice is returned when the requested page lies beyond
// the end of ids, or when page or pagesize is smaller than 1 (the latter used
// to panic with a negative slice bound).
func getPageIds(ids []int64, page int, pagesize int) []int64 {
	if page < 1 || pagesize < 1 {
		return []int64{}
	}

	begin := (page - 1) * pagesize
	if begin >= len(ids) {
		return []int64{}
	}

	end := begin + pagesize
	if end > len(ids) {
		end = len(ids)
	}
	return ids[begin:end]
}

func GetStoppedJobWithNoDurationJob() ([]*Cloudbrain, error) {
cloudbrains := make([]*Cloudbrain, 0)
return cloudbrains, x.


+ 17
- 2
modules/cron/tasks_basic.go View File

@@ -5,10 +5,13 @@
package cron

import (
"code.gitea.io/gitea/modules/urfs_client/urchin"
"code.gitea.io/gitea/modules/setting"
"context"
"time"

"code.gitea.io/gitea/modules/urfs_client/urchin"
cloudbrainService "code.gitea.io/gitea/services/cloudbrain"

"code.gitea.io/gitea/modules/modelarts"
"code.gitea.io/gitea/services/cloudbrain/resource"
"code.gitea.io/gitea/services/reward"
@@ -190,6 +193,17 @@ func registerHandleRepoAndUserStatistic() {
})
}

func registerHandleClearCloudbrainResult() {
RegisterTaskFatal("handle_cloudbrain_one_result_clear", &BaseConfig{
Enabled: true,
RunAtStart: setting.ClearStrategy.RunAtStart,
Schedule: setting.ClearStrategy.Cron,
}, func(ctx context.Context, _ *models.User, _ Config) error {
cloudbrainService.ClearCloudbrainResultSpace()
return nil
})
}

func registerHandleSummaryStatistic() {
RegisterTaskFatal("handle_summary_statistic", &BaseConfig{
Enabled: true,
@@ -306,6 +320,7 @@ func initBasicTasks() {

registerHandleRepoAndUserStatistic()
registerHandleSummaryStatistic()
registerHandleClearCloudbrainResult()

registerSyncCloudbrainStatus()
registerHandleOrgStatistic()
@@ -317,6 +332,6 @@ func initBasicTasks() {

registerHandleModelSafetyTask()

registerHandleScheduleRecord()
registerHandleScheduleRecord()
registerHandleCloudbrainDurationStatistic()
}

+ 24
- 0
modules/setting/setting.go View File

@@ -519,6 +519,7 @@ var (
CullIdleTimeout string
CullInterval string


//benchmark config
IsBenchmarkEnabled bool
BenchmarkOwner string
@@ -613,6 +614,16 @@ var (
UsageRateBeginTime string
}{}

// ClearStrategy holds the settings of the cloudbrain result clearing job.
// The values are read from the [clear_strategy] section by getClearStrategy.
ClearStrategy= struct {
// Enabled switches the clearing routine on; when false the routine returns
// right after logging that it started.
Enabled bool
// ResultSaveDays is the number of days a finished job keeps its files
// before it becomes a candidate for clearing.
ResultSaveDays int
// BatchSize is the query limit for non-debug jobs per run and also the
// maximum number of trash directories processed per storage location.
BatchSize int
// DebugJobSize is the query limit for debug jobs per run.
DebugJobSize int
// TrashSaveDays is the minimum age in days (by modification time) of a
// directory without a matching job record before it is removed.
TrashSaveDays int
// Cron is the schedule expression of the clearing task.
Cron string
// RunAtStart is passed to the cron task configuration as its RunAtStart flag.
RunAtStart bool
}{}

C2NetInfos *C2NetSqInfos
CenterInfos *AiCenterInfos
C2NetMapInfo map[string]*C2NetSequenceInfo
@@ -1619,6 +1630,7 @@ func NewContext() {
getModelConvertConfig()
getModelSafetyConfig()
getModelAppConfig()
getClearStrategy()
}

func getModelSafetyConfig() {
@@ -1679,6 +1691,18 @@ func getModelartsCDConfig() {
getNotebookFlavorInfos()
}

// getClearStrategy fills ClearStrategy from the [clear_strategy] section of
// the configuration, falling back to the defaults below for missing keys.
func getClearStrategy() {
	section := Cfg.Section("clear_strategy")
	strategy := &ClearStrategy

	strategy.Enabled = section.Key("ENABLED").MustBool(false)
	strategy.ResultSaveDays = section.Key("RESULT_SAVE_DAYS").MustInt(30)
	strategy.BatchSize = section.Key("BATCH_SIZE").MustInt(500)
	strategy.DebugJobSize = section.Key("DEBUG_BATCH_SIZE").MustInt(100)
	strategy.TrashSaveDays = section.Key("TRASH_SAVE_DAYS").MustInt(90)
	// NOTE(review): if the scheduler parses six fields with seconds first, the
	// leading "*" fires every second during minutes 0 and 30 — confirm intended.
	strategy.Cron = section.Key("CRON").MustString("* 0,30 2-8 * * ?")
	strategy.RunAtStart = section.Key("RUN_AT_START").MustBool(false)
}

func getGrampusConfig() {
sec := Cfg.Section("grampus")



+ 1
- 0
options/locale/locale_en-US.ini View File

@@ -3246,6 +3246,7 @@ specification = specification
select_specification = select specification
description = description
wrong_specification=You cannot use this specification, please choose another item.
result_cleared=The files of this task have been cleared, so it cannot be restarted. Please create a new debug task instead.
resource_use=Resource Occupancy

job_name_rule = Please enter letters, numbers, _ and - up to 64 characters and cannot end with a dash (-).


+ 2
- 0
options/locale/locale_zh-CN.ini View File

@@ -3266,6 +3266,8 @@ card_duration = 运行卡时
card_type = 卡类型
wrong_specification=您目前不能使用这个资源规格,请选择其他资源规格。

result_cleared=本任务的文件已被清理,无法再次调试,请新建调试任务。

job_name_rule = 请输入字母、数字、_和-,最长64个字符,且不能以中划线(-)结尾。
train_dataset_path_rule = 数据集位置存储在运行参数 <strong style="color:#010101">data_url</strong> 中,预训练模型存放在运行参数 <strong style="color:#010101">ckpt_url</strong> 中,训练输出路径存储在运行参数 <strong style="color:#010101">train_url</strong> 中。
infer_dataset_path_rule = 数据集位置存储在运行参数 <strong style="color:#010101">data_url</strong> 中,推理输出路径存储在运行参数 <strong style="color:#010101">result_url</strong> 中。


+ 12
- 0
routers/repo/cloudbrain.go View File

@@ -670,6 +670,13 @@ func CloudBrainRestart(ctx *context.Context) {
break
}

if _, err := os.Stat(getOldJobPath(task)); err != nil {
log.Error("Can not find job minio path", err)
resultCode = "-1"
errorMsg = ctx.Tr("cloudbrain.result_cleared")
break
}

count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainOne, string(models.JobTypeDebug))
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
@@ -704,6 +711,11 @@ func CloudBrainRestart(ctx *context.Context) {
})
}


// getOldJobPath builds the path of the task's code directory inside the minio
// bucket: <RealPath><Bucket>/<CBCodePathPrefix><JobName>.
func getOldJobPath(task *models.Cloudbrain) string {
	bucketRoot := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket
	jobDir := setting.CBCodePathPrefix + task.JobName
	return bucketRoot + "/" + jobDir
}

// CloudBrainBenchMarkShow handles the detail view of a benchmark task by
// delegating to cloudBrainShow with the benchmark template and the benchmark
// job type.
func CloudBrainBenchMarkShow(ctx *context.Context) {
cloudBrainShow(ctx, tplCloudBrainBenchmarkShow, models.JobTypeBenchmark)
}


+ 151
- 0
services/cloudbrain/clear.go View File

@@ -0,0 +1,151 @@
package cloudbrain

import (
"io/ioutil"
"os"
"sort"
"time"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
)

// ClearCloudbrainResultSpace removes the stored files of old finished
// cloudbrain jobs and marks the corresponding records as cleared.
//
// It does nothing unless setting.ClearStrategy.Enabled is set. One run handles
// at most BatchSize non-debug jobs and DebugJobSize debug jobs that finished
// more than ResultSaveDays days ago. If either lookup fails the run is
// aborted. When fewer jobs than the combined limit were found, the run also
// sweeps the local and minio job directories for entries without a record.
func ClearCloudbrainResultSpace() {
	log.Info("clear cloudbrain one result space begin.")
	if !setting.ClearStrategy.Enabled {
		return
	}

	tasks, err := models.GetCloudBrainOneStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize)
	if err != nil {
		log.Warn("Failed to get cloudbrain, clear result failed: %v", err)
		return
	}
	debugTasks, err := models.GetCloudBrainOneStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize)
	if err != nil {
		log.Warn("Failed to get debug cloudbrain, clear result failed: %v", err)
		return
	}
	tasks = append(tasks, debugTasks...)

	// Only jobs whose storage was removed successfully are marked as cleared,
	// so that failed ones are picked up again by a later run.
	ids := make([]int64, 0, len(tasks))
	for _, task := range tasks {
		if err := DeleteCloudbrainOneJobStorage(task.JobName); err != nil {
			// The failure has already been logged by the storage helper.
			continue
		}
		log.Info("clear job in cloudbrain table:%s", task.JobName)
		ids = append(ids, task.ID)
	}

	if err := models.UpdateCloudBrainRecordsCleared(ids); err != nil {
		log.Warn("Failed to set cloudbrain cleared status: %v", err)
	}

	// Once the cloudbrain table has no more backlog, walk the storage
	// directories and remove historical trash data, if there is any.
	if len(tasks) < setting.ClearStrategy.BatchSize+setting.ClearStrategy.DebugJobSize {
		clearLocalHistoryTrashFile()
		clearMinioHistoryTrashFile()
	}
	log.Info("clear cloudbrain one result space end.")
}

// clearMinioHistoryTrashFile removes job directories from the minio code path
// that are older than setting.ClearStrategy.TrashSaveDays days and have no
// cloudbrain record with a matching job name.
//
// Entries are visited oldest first and the walk stops at the first entry that
// is not old enough (or has an empty name). At most BatchSize deletions are
// attempted per call.
func clearMinioHistoryTrashFile() {
	jobRealPrefix := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix

	miniofiles, err := ioutil.ReadDir(jobRealPrefix)
	if err != nil {
		log.Warn("Can not browser minio job path: %v", err)
		return
	}

	SortModTimeAscend(miniofiles)
	expireBefore := time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)
	processCount := 0
	for _, file := range miniofiles {
		// Sorted ascending: the first entry that is too new ends the sweep.
		if file.Name() == "" || !file.ModTime().Before(expireBefore) {
			break
		}

		has, err := models.IsCloudbrainExistByJobName(file.Name())
		if err != nil {
			// Keep the directory when its record cannot be checked.
			log.Warn("Failed to check cloudbrain record for minio job dir %s: %v", file.Name(), err)
		} else if !has {
			dirPath := setting.CBCodePathPrefix + file.Name() + "/"
			log.Info("clear job in minio trash:%s", file.Name())
			if err := storage.Attachments.DeleteDir(dirPath); err != nil {
				log.Error("DeleteDir(%s) failed:%v", dirPath, err)
			}
			// Failed attempts count as well, which bounds the work per run.
			processCount++
		}
		if processCount == setting.ClearStrategy.BatchSize {
			break
		}
	}
}

// clearLocalHistoryTrashFile removes job directories below setting.JobPath
// that are older than setting.ClearStrategy.TrashSaveDays days and have no
// cloudbrain record with a matching job name.
//
// Entries are visited oldest first and the walk stops at the first entry that
// is not old enough (or has an empty name). At most BatchSize deletions are
// attempted per call.
func clearLocalHistoryTrashFile() {
	files, err := ioutil.ReadDir(setting.JobPath)
	if err != nil {
		log.Warn("Can not browser local job path: %v", err)
		return
	}

	SortModTimeAscend(files)
	expireBefore := time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)
	processCount := 0
	for _, file := range files {
		// Sorted ascending: the first entry that is too new ends the sweep.
		if file.Name() == "" || !file.ModTime().Before(expireBefore) {
			break
		}

		has, err := models.IsCloudbrainExistByJobName(file.Name())
		if err != nil {
			// Keep the directory when its record cannot be checked.
			log.Warn("Failed to check cloudbrain record for local job dir %s: %v", file.Name(), err)
		} else if !has {
			localPath := setting.JobPath + file.Name()
			if err := os.RemoveAll(localPath); err != nil {
				log.Error("RemoveAll(%s) failed:%v", localPath, err)
			} else {
				log.Info("clear job in local trash:%s", file.Name())
			}
			// Failed attempts count as well, which bounds the work per run.
			processCount++
		}
		if processCount == setting.ClearStrategy.BatchSize {
			break
		}
	}
}

// SortModTimeAscend sorts files in place by modification time, oldest first.
func SortModTimeAscend(files []os.FileInfo) {
	olderFirst := func(a, b int) bool {
		return files[b].ModTime().After(files[a].ModTime())
	}
	sort.Slice(files, olderFirst)
}

// DeleteCloudbrainOneJobStorage removes the files stored for the job with the
// given name: the local directory setting.JobPath+jobName and the directory
// setting.CBCodePathPrefix+jobName+"/" in the attachment storage.
//
// Both deletions are always attempted and each failure is logged. The local
// error is returned if there is one, otherwise the storage error. An empty
// jobName is a no-op and returns nil.
func DeleteCloudbrainOneJobStorage(jobName string) error {
	if jobName == "" {
		return nil
	}

	// delete local
	localJobPath := setting.JobPath + jobName
	localErr := os.RemoveAll(localJobPath)
	if localErr != nil {
		log.Error("RemoveAll(%s) failed:%v", localJobPath, localErr)
	}

	// delete from attachment storage
	dirPath := setting.CBCodePathPrefix + jobName + "/"
	storageErr := storage.Attachments.DeleteDir(dirPath)
	if storageErr != nil {
		// Report the storage path and the storage error; the previous code
		// logged the local path together with the local error here.
		log.Error("DeleteDir(%s) failed:%v", dirPath, storageErr)
	}

	if localErr != nil {
		return localErr
	}
	return storageErr
}

Loading…
Cancel
Save