You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

train.go 23 kB

3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678
  1. package cloudbrainTask
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "io"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "path"
  11. "regexp"
  12. "strings"
  13. "code.gitea.io/gitea/modules/obs"
  14. "code.gitea.io/gitea/modules/git"
  15. "code.gitea.io/gitea/modules/storage"
  16. "github.com/unknwon/com"
  17. "code.gitea.io/gitea/models"
  18. "code.gitea.io/gitea/modules/cloudbrain"
  19. "code.gitea.io/gitea/modules/context"
  20. "code.gitea.io/gitea/modules/grampus"
  21. "code.gitea.io/gitea/modules/log"
  22. "code.gitea.io/gitea/modules/modelarts"
  23. "code.gitea.io/gitea/modules/redis/redis_key"
  24. "code.gitea.io/gitea/modules/redis/redis_lock"
  25. "code.gitea.io/gitea/modules/setting"
  26. api "code.gitea.io/gitea/modules/structs"
  27. "code.gitea.io/gitea/modules/util"
  28. "code.gitea.io/gitea/services/cloudbrain/resource"
  29. "code.gitea.io/gitea/services/reward/point/account"
  30. )
  // jobNamePattern is the format a display job name must match; checkParameters
  // rejects names that do not match. It accepts 3 to 36 characters: a leading
  // lowercase letter or digit, then lowercase letters, digits, '-' or '_', and a
  // final lowercase letter, digit or '-'.
  31. var jobNamePattern = regexp.MustCompile(`^[a-z0-9][a-z0-9-_]{1,34}[a-z0-9-]$`)
  32. func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOption) {
  33. displayJobName := option.DisplayJobName
  34. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  35. uuid := option.Attachment
  36. description := option.Description
  37. bootFile := strings.TrimSpace(option.BootFile)
  38. params := option.Params
  39. repo := ctx.Repo.Repository
  40. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  41. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  42. branchName := option.BranchName
  43. image := strings.TrimSpace(option.Image)
  44. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  45. defer lock.UnLock()
  46. spec, datasetInfos, datasetNames, err := checkParameters(ctx, option, lock, repo)
  47. if err != nil {
  48. ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
  49. return
  50. }
  51. //prepare code and out path
  52. _, err = ioutil.ReadDir(codeLocalPath)
  53. if err == nil {
  54. os.RemoveAll(codeLocalPath)
  55. }
  56. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  57. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  58. ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
  59. }
  60. //todo: upload code (send to file_server todo this work?)
  61. //upload code
  62. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  63. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  64. ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
  65. return
  66. }
  67. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  68. if err := mkModelPath(modelPath); err != nil {
  69. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  70. ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
  71. return
  72. }
  73. //init model readme
  74. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  75. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  76. ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
  77. return
  78. }
  79. var datasetRemotePath, allFileName string
  80. for _, datasetInfo := range datasetInfos {
  81. if datasetRemotePath == "" {
  82. datasetRemotePath = datasetInfo.DataLocalPath
  83. allFileName = datasetInfo.FullName
  84. } else {
  85. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  86. allFileName = allFileName + ";" + datasetInfo.FullName
  87. }
  88. }
  89. //prepare command
  90. preTrainModelPath := getPreTrainModelPath(option.PreTrainModelUrl, option.CkptName)
  91. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, option.CkptName)
  92. if err != nil {
  93. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  94. ctx.JSON(http.StatusOK, models.BaseErrorMessage("Create task failed, internal error"))
  95. return
  96. }
  97. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  98. req := &grampus.GenerateTrainJobReq{
  99. JobName: jobName,
  100. DisplayJobName: displayJobName,
  101. ComputeResource: models.GPUResource,
  102. ProcessType: grampus.ProcessorTypeGPU,
  103. Command: command,
  104. ImageUrl: image,
  105. Description: description,
  106. BootFile: bootFile,
  107. Uuid: uuid,
  108. CommitID: commitID,
  109. BranchName: branchName,
  110. Params: option.Params,
  111. EngineName: image,
  112. DatasetNames: datasetNames,
  113. DatasetInfos: datasetInfos,
  114. IsLatestVersion: modelarts.IsLatestVersion,
  115. VersionCount: modelarts.VersionCountOne,
  116. WorkServerNumber: 1,
  117. Spec: spec,
  118. }
  119. if option.ModelName != "" { //使用预训练模型训练
  120. req.ModelName = option.ModelName
  121. req.LabelName = option.LabelName
  122. req.CkptName = option.CkptName
  123. req.ModelVersion = option.ModelVersion
  124. req.PreTrainModelUrl = option.PreTrainModelUrl
  125. }
  126. jobId, err := grampus.GenerateTrainJob(ctx, req)
  127. if err != nil {
  128. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  129. ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
  130. return
  131. }
  132. ctx.JSON(http.StatusOK, models.BaseMessage{Code: 0, Message: jobId})
  133. }
  134. func checkParameters(ctx *context.Context, option api.CreateTrainJobOption, lock *redis_lock.DistributeLock, repo *models.Repository) (*models.Specification, map[string]models.DatasetInfo, string, error) {
  135. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  136. if !isOk {
  137. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  138. return nil, nil, "", fmt.Errorf(ctx.Tr("repo.cloudbrain_samejob_err"))
  139. }
  140. if !jobNamePattern.MatchString(option.DisplayJobName) {
  141. return nil, nil, "", fmt.Errorf(ctx.Tr("repo.cloudbrain_jobname_err"))
  142. }
  143. bootFileExist, err := ctx.Repo.FileExists(option.BootFile, option.BranchName)
  144. if err != nil || !bootFileExist {
  145. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  146. return nil, nil, "", fmt.Errorf(ctx.Tr("repo.cloudbrain_bootfile_err"))
  147. }
  148. //check count limit
  149. count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.GPUResource)
  150. if err != nil {
  151. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  152. return nil, nil, "", fmt.Errorf("system error")
  153. } else {
  154. if count >= 1 {
  155. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  156. return nil, nil, "", fmt.Errorf("you have already a running or waiting task, can not create more.")
  157. }
  158. }
  159. //check param
  160. if err := grampusParamCheckCreateTrainJob(option.BootFile, option.BranchName); err != nil {
  161. log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
  162. return nil, nil, "", err
  163. }
  164. //check whether the task name in the project is duplicated
  165. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), option.DisplayJobName)
  166. if err == nil {
  167. if len(tasks) != 0 {
  168. log.Error("the job name did already exist", ctx.Data["MsgID"])
  169. return nil, nil, "", fmt.Errorf("The job name did already exist.")
  170. }
  171. } else {
  172. if !models.IsErrJobNotExist(err) {
  173. log.Error("system error, %v", err, ctx.Data["MsgID"])
  174. return nil, nil, "", fmt.Errorf("system error")
  175. }
  176. }
  177. //check specification
  178. computeResource := models.GPU
  179. if option.Type == 3 {
  180. computeResource = models.NPU
  181. }
  182. spec, err := resource.GetAndCheckSpec(ctx.User.ID, option.SpecId, models.FindSpecsOptions{
  183. JobType: models.JobTypeTrain,
  184. ComputeResource: computeResource,
  185. Cluster: models.C2NetCluster,
  186. })
  187. if err != nil || spec == nil {
  188. return nil, nil, "", fmt.Errorf("Resource specification is not available.")
  189. }
  190. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  191. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  192. return nil, nil, "", fmt.Errorf(ctx.Tr("points.insufficient_points_balance"))
  193. }
  194. //check dataset
  195. datasetInfos, datasetNames, err := models.GetDatasetInfo(option.Attachment, computeResource)
  196. if err != nil {
  197. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  198. return nil, nil, "", fmt.Errorf(ctx.Tr("cloudbrain.error.dataset_select"))
  199. }
  200. return spec, datasetInfos, datasetNames, err
  201. }
  202. func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOption) {
  203. displayJobName := option.DisplayJobName
  204. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  205. uuid := option.Attachment
  206. description := option.Description
  207. bootFile := strings.TrimSpace(option.BootFile)
  208. params := option.Params
  209. repo := ctx.Repo.Repository
  210. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  211. codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
  212. branchName := option.BranchName
  213. isLatestVersion := modelarts.IsLatestVersion
  214. versionCount := modelarts.VersionCountOne
  215. engineName := option.Image
  216. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  217. defer lock.UnLock()
  218. spec, datasetInfos, datasetNames, err := checkParameters(ctx, option, lock, repo)
  219. if err != nil {
  220. ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
  221. return
  222. }
  223. //prepare code and out path
  224. _, err = ioutil.ReadDir(codeLocalPath)
  225. if err == nil {
  226. os.RemoveAll(codeLocalPath)
  227. }
  228. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  229. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  230. ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
  231. return
  232. }
  233. //todo: upload code (send to file_server todo this work?)
  234. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
  235. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  236. ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
  237. return
  238. }
  239. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  240. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  241. ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
  242. return
  243. }
  244. var datasetRemotePath, allFileName string
  245. for _, datasetInfo := range datasetInfos {
  246. if datasetRemotePath == "" {
  247. datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  248. allFileName = datasetInfo.FullName
  249. } else {
  250. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  251. allFileName = allFileName + ";" + datasetInfo.FullName
  252. }
  253. }
  254. //prepare command
  255. preTrainModelPath := getPreTrainModelPath(option.PreTrainModelUrl, option.CkptName)
  256. command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, option.CkptName)
  257. if err != nil {
  258. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  259. ctx.JSON(http.StatusOK, models.BaseErrorMessage("Create task failed, internal error"))
  260. return
  261. }
  262. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  263. req := &grampus.GenerateTrainJobReq{
  264. JobName: jobName,
  265. DisplayJobName: displayJobName,
  266. ComputeResource: models.NPUResource,
  267. ProcessType: grampus.ProcessorTypeNPU,
  268. Command: command,
  269. ImageId: option.ImageID,
  270. Description: description,
  271. CodeObsPath: codeObsPath,
  272. BootFileUrl: codeObsPath + bootFile,
  273. BootFile: bootFile,
  274. WorkServerNumber: option.WorkServerNumber,
  275. Uuid: uuid,
  276. CommitID: commitID,
  277. IsLatestVersion: isLatestVersion,
  278. BranchName: branchName,
  279. Params: option.Params,
  280. EngineName: engineName,
  281. VersionCount: versionCount,
  282. TotalVersionCount: modelarts.TotalVersionCount,
  283. DatasetNames: datasetNames,
  284. DatasetInfos: datasetInfos,
  285. Spec: spec,
  286. CodeName: strings.ToLower(repo.Name),
  287. }
  288. if option.ModelName != "" { //使用预训练模型训练
  289. req.ModelName = option.ModelName
  290. req.LabelName = option.LabelName
  291. req.CkptName = option.CkptName
  292. req.ModelVersion = option.ModelVersion
  293. req.PreTrainModelUrl = option.PreTrainModelUrl
  294. req.PreTrainModelPath = preTrainModelPath
  295. }
  296. jobId, err := grampus.GenerateTrainJob(ctx, req)
  297. if err != nil {
  298. log.Error("GenerateTrainJob failed:%v", err.Error())
  299. ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
  300. return
  301. }
  302. ctx.JSON(http.StatusOK, models.BaseMessage{Code: 0, Message: jobId})
  303. }
  304. func obsMkdir(dir string) error {
  305. input := &obs.PutObjectInput{}
  306. input.Bucket = setting.Bucket
  307. input.Key = dir
  308. _, err := storage.ObsCli.PutObject(input)
  309. if err != nil {
  310. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  311. return err
  312. }
  313. return nil
  314. }
  315. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  316. files, err := readDir(codePath)
  317. if err != nil {
  318. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  319. return err
  320. }
  321. for _, file := range files {
  322. if file.IsDir() {
  323. input := &obs.PutObjectInput{}
  324. input.Bucket = setting.Bucket
  325. input.Key = parentDir + file.Name() + "/"
  326. _, err = storage.ObsCli.PutObject(input)
  327. if err != nil {
  328. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  329. return err
  330. }
  331. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  332. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  333. return err
  334. }
  335. } else {
  336. input := &obs.PutFileInput{}
  337. input.Bucket = setting.Bucket
  338. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  339. input.SourceFile = codePath + file.Name()
  340. _, err = storage.ObsCli.PutFile(input)
  341. if err != nil {
  342. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  343. return err
  344. }
  345. }
  346. }
  347. return nil
  348. }
  349. func grampusParamCheckCreateTrainJob(bootFile string, branchName string) error {
  350. if !strings.HasSuffix(strings.TrimSpace(bootFile), ".py") {
  351. log.Error("the boot file(%s) must be a python file", bootFile)
  352. return errors.New("启动文件必须是python文件")
  353. }
  354. if branchName == "" {
  355. log.Error("the branch must not be null!", branchName)
  356. return errors.New("代码分支不能为空!")
  357. }
  358. return nil
  359. }
  360. func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
  361. archiveType := git.ZIP
  362. archivePath := codePath
  363. if !com.IsDir(archivePath) {
  364. if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
  365. log.Error("MkdirAll failed:" + err.Error())
  366. return err
  367. }
  368. }
  369. // Get corresponding commit.
  370. var (
  371. commit *git.Commit
  372. err error
  373. )
  374. gitRepo := ctx.Repo.GitRepo
  375. if err != nil {
  376. log.Error("OpenRepository failed:" + err.Error())
  377. return err
  378. }
  379. if gitRepo.IsBranchExist(branchName) {
  380. commit, err = gitRepo.GetBranchCommit(branchName)
  381. if err != nil {
  382. log.Error("GetBranchCommit failed:" + err.Error())
  383. return err
  384. }
  385. } else {
  386. log.Error("the branch is not exist: " + branchName)
  387. return fmt.Errorf("The branch does not exist.")
  388. }
  389. archivePath = path.Join(archivePath, grampus.CodeArchiveName)
  390. if !com.IsFile(archivePath) {
  391. if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
  392. Format: archiveType,
  393. Prefix: setting.Repository.PrefixArchiveFiles,
  394. }); err != nil {
  395. log.Error("CreateArchive failed:" + err.Error())
  396. return err
  397. }
  398. }
  399. return nil
  400. }
  401. func uploadCodeToMinio(codePath, jobName, parentDir string) error {
  402. files, err := readDir(codePath)
  403. if err != nil {
  404. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  405. return err
  406. }
  407. for _, file := range files {
  408. if file.IsDir() {
  409. if err = uploadCodeToMinio(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  410. log.Error("uploadCodeToMinio(%s) failed: %s", file.Name(), err.Error())
  411. return err
  412. }
  413. } else {
  414. destObject := setting.CBCodePathPrefix + jobName + parentDir + file.Name()
  415. sourceFile := codePath + file.Name()
  416. err = storage.Attachments.UploadObject(destObject, sourceFile)
  417. if err != nil {
  418. log.Error("UploadObject(%s) failed: %s", file.Name(), err.Error())
  419. return err
  420. }
  421. }
  422. }
  423. return nil
  424. }
  425. func readDir(dirname string) ([]os.FileInfo, error) {
  426. f, err := os.Open(dirname)
  427. if err != nil {
  428. return nil, err
  429. }
  430. list, err := f.Readdir(0)
  431. f.Close()
  432. if err != nil {
  433. //todo: can not upload empty folder
  434. if err == io.EOF {
  435. return nil, nil
  436. }
  437. return nil, err
  438. }
  439. //sort.Slice(list, func(i, j int) bool { return list[i].Name() < list[j].Name() })
  440. return list, nil
  441. }
  442. func mkModelPath(modelPath string) error {
  443. return mkPathAndReadMeFile(modelPath, "You can put the files into this directory and download the files by the web page.")
  444. }
  445. func mkPathAndReadMeFile(path string, text string) error {
  446. err := os.MkdirAll(path, os.ModePerm)
  447. if err != nil {
  448. log.Error("MkdirAll(%s) failed:%v", path, err)
  449. return err
  450. }
  451. fileName := path + "README"
  452. f, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
  453. if err != nil {
  454. log.Error("OpenFile failed", err.Error())
  455. return err
  456. }
  457. defer f.Close()
  458. _, err = f.WriteString(text)
  459. if err != nil {
  460. log.Error("WriteString failed", err.Error())
  461. return err
  462. }
  463. return nil
  464. }
  465. func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
  466. index := strings.Index(pretrainModelDir, "/")
  467. if index > 0 {
  468. filterBucket := pretrainModelDir[index+1:]
  469. return filterBucket + fileName
  470. } else {
  471. return ""
  472. }
  473. }
  474. func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName string) (string, error) {
  475. var command string
  476. workDir := grampus.NpuWorkDir
  477. if processorType == grampus.ProcessorTypeGPU {
  478. workDir = grampus.GpuWorkDir
  479. }
  480. command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
  481. //download code & dataset
  482. if processorType == grampus.ProcessorTypeNPU {
  483. //no need to download code & dataset by internet
  484. } else if processorType == grampus.ProcessorTypeGPU {
  485. commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
  486. commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
  487. command += commandDownload
  488. }
  489. //unzip code & dataset
  490. if processorType == grampus.ProcessorTypeNPU {
  491. //no need to process
  492. } else if processorType == grampus.ProcessorTypeGPU {
  493. unZipDatasetCommand := generateDatasetUnzipCommand(datasetName)
  494. commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
  495. command += commandUnzip
  496. }
  497. command += "echo \"unzip finished;start to exec code;\";"
  498. // set export
  499. var commandExport string
  500. if processorType == grampus.ProcessorTypeNPU {
  501. commandExport = "export bucket=" + setting.Bucket + " && export remote_path=" + outputRemotePath + ";"
  502. } else if processorType == grampus.ProcessorTypeGPU {
  503. commandExport = "export env=" + setting.Grampus.Env + " && export remote_path=" + outputRemotePath + ";"
  504. }
  505. command += commandExport
  506. //exec code
  507. var parameters models.Parameters
  508. var paramCode string
  509. if len(paramSrc) != 0 {
  510. err := json.Unmarshal([]byte(paramSrc), &parameters)
  511. if err != nil {
  512. log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
  513. return command, err
  514. }
  515. for _, parameter := range parameters.Parameter {
  516. paramCode += " --" + parameter.Label + "=" + parameter.Value
  517. }
  518. }
  519. var commandCode string
  520. if processorType == grampus.ProcessorTypeNPU {
  521. commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py /tmp/log/train.log" + paramCode + ";"
  522. } else if processorType == grampus.ProcessorTypeGPU {
  523. if pretrainModelFileName != "" {
  524. paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName
  525. }
  526. commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
  527. }
  528. command += commandCode
  529. //get exec result
  530. commandGetRes := "result=$?;"
  531. command += commandGetRes
  532. //upload models
  533. if processorType == grampus.ProcessorTypeNPU {
  534. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_npu " + setting.Bucket + " " + outputRemotePath + " " + workDir + "output/;"
  535. command += commandUpload
  536. } else if processorType == grampus.ProcessorTypeGPU {
  537. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
  538. command += commandUpload
  539. }
  540. //check exec result
  541. commandCheckRes := "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\""
  542. command += commandCheckRes
  543. return command, nil
  544. }
  545. func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
  546. commandDownloadTemp := commandDownload
  547. if pretrainModelPath != "" {
  548. commandDownloadTemp += " '" + pretrainModelPath + "' '" + pretrainModelFileName + "'"
  549. }
  550. commandDownloadTemp += ";"
  551. return commandDownloadTemp
  552. }
  553. func generateDatasetUnzipCommand(datasetName string) string {
  554. var unZipDatasetCommand string
  555. datasetNameArray := strings.Split(datasetName, ";")
  556. if len(datasetNameArray) == 1 { //单数据集
  557. unZipDatasetCommand = "unzip -q '" + datasetName + "';"
  558. if strings.HasSuffix(datasetNameArray[0], ".tar.gz") {
  559. unZipDatasetCommand = "tar --strip-components=1 -zxvf '" + datasetName + "';"
  560. }
  561. } else { //多数据集
  562. for _, datasetNameTemp := range datasetNameArray {
  563. if strings.HasSuffix(datasetNameTemp, ".tar.gz") {
  564. unZipDatasetCommand = unZipDatasetCommand + "tar -zxvf '" + datasetNameTemp + "';"
  565. } else {
  566. unZipDatasetCommand = unZipDatasetCommand + "unzip -q '" + datasetNameTemp + "' -d './" + strings.TrimSuffix(datasetNameTemp, ".zip") + "';"
  567. }
  568. }
  569. }
  570. return unZipDatasetCommand
  571. }
  572. func getPoolId() string {
  573. var resourcePools modelarts.ResourcePool
  574. json.Unmarshal([]byte(setting.ResourcePools), &resourcePools)
  575. return resourcePools.Info[0].ID
  576. }