VASP高通量计算


问题描述

以vasp为例,假定你有很多结构,在init_clusters里面:

init_clusters/N-10-sg-0.vasp  init_clusters/N-12-sg-5.vasp  init_clusters/N-15-sg-4.vasp  init_clusters/N-18-sg-3.vasp
init_clusters/N-10-sg-1.vasp  init_clusters/N-13-sg-0.vasp  init_clusters/N-15-sg-5.vasp  init_clusters/N-18-sg-4.vasp
init_clusters/N-10-sg-2.vasp  init_clusters/N-13-sg-1.vasp  init_clusters/N-16-sg-0.vasp  init_clusters/N-18-sg-5.vasp
init_clusters/N-10-sg-3.vasp  init_clusters/N-13-sg-2.vasp  init_clusters/N-16-sg-1.vasp  init_clusters/N-19-sg-0.vasp
init_clusters/N-10-sg-4.vasp  init_clusters/N-13-sg-3.vasp  init_clusters/N-16-sg-2.vasp  init_clusters/N-19-sg-1.vasp
init_clusters/N-10-sg-5.vasp  init_clusters/N-13-sg-4.vasp  init_clusters/N-16-sg-3.vasp  init_clusters/N-19-sg-2.vasp
init_clusters/N-11-sg-0.vasp  init_clusters/N-13-sg-5.vasp  init_clusters/N-16-sg-4.vasp  init_clusters/N-19-sg-3.vasp
init_clusters/N-11-sg-1.vasp  init_clusters/N-14-sg-0.vasp  init_clusters/N-16-sg-5.vasp  init_clusters/N-19-sg-4.vasp
init_clusters/N-11-sg-2.vasp  init_clusters/N-14-sg-1.vasp  init_clusters/N-17-sg-0.vasp  init_clusters/N-19-sg-5.vasp
init_clusters/N-11-sg-3.vasp  init_clusters/N-14-sg-2.vasp  init_clusters/N-17-sg-1.vasp  init_clusters/N-20-sg-0.vasp
init_clusters/N-11-sg-4.vasp  init_clusters/N-14-sg-3.vasp  init_clusters/N-17-sg-2.vasp  init_clusters/N-20-sg-1.vasp
init_clusters/N-11-sg-5.vasp  init_clusters/N-14-sg-4.vasp  init_clusters/N-17-sg-3.vasp  init_clusters/N-20-sg-2.vasp
init_clusters/N-12-sg-0.vasp  init_clusters/N-14-sg-5.vasp  init_clusters/N-17-sg-4.vasp  init_clusters/N-20-sg-3.vasp
init_clusters/N-12-sg-1.vasp  init_clusters/N-15-sg-0.vasp  init_clusters/N-17-sg-5.vasp  init_clusters/N-20-sg-4.vasp
init_clusters/N-12-sg-2.vasp  init_clusters/N-15-sg-1.vasp  init_clusters/N-18-sg-0.vasp  init_clusters/N-20-sg-5.vasp
init_clusters/N-12-sg-3.vasp  init_clusters/N-15-sg-2.vasp  init_clusters/N-18-sg-1.vasp
init_clusters/N-12-sg-4.vasp  init_clusters/N-15-sg-3.vasp  init_clusters/N-18-sg-2.vasp

现在想要用相同的INCAR,POTCAR,KPOINTS提交任务(当然,这些也可以不一样,需要适当修改下面程序),计算完的任务全部放在fp文件夹(提前手动建立)里面。


实际上需要满足:

  • 容错 custodian 可以自动解决vasp常见错误
  • 重启 计算过的任务不会重新计算
  • 远程提交 可以在本地向远程机器提交任务 (比如通过cl10向cl9提交任务)
  • 队列系统 需要适用于常见的队列系统

运行脚本

runvasp.py


import os
import sys
import shutil
from glob import glob
from dpgen.remote.decide_machine import decide_fp_machine
from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher
from monty.serialization import loadfn,dumpfn

def create_path (path,backup=False) :
    """Make sure the directory *path* exists and return it with a trailing '/'.

    Parameters
    ----------
    path : str
        Directory to create; a trailing '/' is appended when missing.
    backup : bool
        Only matters when the directory already exists. If true, the
        existing directory is renamed to the first unused
        ``<dir>.bkNNNNNN`` name (six-digit counter starting at 0) and a
        fresh empty directory is created in its place. If false, the
        existing directory is left untouched.

    Returns
    -------
    str
        The normalised path, always ending in '/'.
    """
    if path[-1] != "/":
        path = path + '/'

    # Nothing there yet: create it (including missing parents) and finish.
    if not os.path.isdir(path):
        os.makedirs(path)
        return path

    # Already present and the caller wants to keep using it as-is.
    if not backup:
        return path

    # dirname() of a path ending in '/' is the same path without the slash.
    existing = os.path.dirname(path)
    index = 0
    # Walk the counter forward until a backup name is free.
    while os.path.isdir(existing + ".bk%06d" % index):
        index += 1
    shutil.move(existing, existing + ".bk%06d" % index)
    os.makedirs(path)
    return path
#--------------------------------------------------
# Stage one task directory (fp/task.NNNNN) per structure in init_clusters/.
pwd=os.getcwd()
work_path = os.path.join(pwd,'fp')

# Sorted so that task numbers follow the file-name order of the structures.
fs=sorted(glob(os.path.join(pwd,'init_clusters','*.vasp')))

# Input files shared by every task, taken from the launch directory.
common_inputs = ('INCAR', 'POTCAR', 'KPOINTS')

for i, structure in enumerate(fs):
    task_name="task.%05d"%i
    task_path=create_path(os.path.join(work_path,task_name))
    # Copy to explicit destinations instead of chdir-ing into the task
    # directory, so a failed copy cannot leave the process in another cwd.
    # The structure is kept under its original name and also as POSCAR.
    shutil.copyfile(structure, os.path.join(task_path,os.path.basename(structure)))
    shutil.copyfile(structure, os.path.join(task_path,'POSCAR'))
    for name in common_inputs:
        shutil.copyfile(os.path.join(pwd,name), os.path.join(task_path,name))
# Debugging aid: uncomment to stop right after the directories are staged.
#os._exit(0)
# Every task.* directory under fp/ is submitted, identified by its base name.
fp_tasks = sorted(glob(os.path.join(work_path, 'task.*')))
run_tasks = [os.path.basename(ii) for ii in fp_tasks]
#----------------------------------------------------
# Read the machine settings and let dpgen select the "fp" entry.
# NOTE(review): decide_fp_machine is assumed to return a dict holding the
# fp_machine / fp_resources / fp_command / fp_group_size keys used below.
mdata = decide_fp_machine(loadfn('machine.json'))
#dumpfn(mdata,'new.json',indent=4)
fp_machine = mdata['fp_machine']
fp_resources = mdata['fp_resources']
fp_command = mdata['fp_command']
fp_group_size = mdata['fp_group_size']
#---------------------------------------------------

# Per-task files passed to run_jobs as the forward (input) list ...
forward_files = ['POSCAR', 'INCAR', 'POTCAR','KPOINTS']
# ... and the files collected back once a task has finished.
backward_files = ['OUTCAR','vasprun.xml','CONTCAR']
# No files are shared between tasks.
forward_common_files=[]
mark_failure =False
# File names passed to run_jobs as outlog / errlog.
log_file='runlog'
err_file='errlog'


dispatcher = make_dispatcher(fp_machine,
                             mdata_resource=fp_resources,
                             work_path=work_path,
                             run_tasks=run_tasks,
                             group_size=fp_group_size)
dispatcher.run_jobs(
    fp_resources,
    [fp_command],           # command list
    work_path,              # directory containing the task.* folders
    run_tasks,              # task directory names
    fp_group_size,          # group size
    forward_common_files,
    forward_files,
    backward_files,
    mark_failure=mark_failure,
    outlog = log_file,
    errlog = err_file)

机器配置文件

machine.json
可以参考dpgen-machine
这里可以配置任意数量的机器,下面以本地slurm提交为例

{
  "train": [
    {
      "machine": {
        "batch": "slurm",
        "hostname": "",
        "port": 22,
        "username": "haidi",
        "work_path": "/fs0/home/haidi/work/Au/runvasp/work"
      },
      "resources": {
        "numb_gpu": 1,
        "numb_node": 1,
        "task_per_node": 30,
        "partition": "gpu4",
        "exclude_list": [],
        "_mem_limit": 28,
        "source_list": [],
        "module_list": [],
        "task_max": 200,
        "time_limit": "48:0:0"
      },
      "command": "/fs0/home/haidi/soft/deepmd-kit-gpu-1.2.0/bin/dp",
      "group_size": 1
    }
  ],

  "model_devi": [
    {
      "machine": {
        "batch": "slurm",
        "hostname": "",
        "port": 22,
        "username": "haidi",
        "work_path": "/fs0/home/haidi/work/Au/runvasp/work"
      },
      "resources": {
        "numb_gpu": 1,
        "numb_node": 1,
        "task_per_node": 4,
        "partition": "gpu4",
        "exclude_list": [],
        "_mem_limit": 28,
        "source_list": [],
        "module_list": [],
        "task_max": 100,
        "time_limit": "23:0:0",
        "envs" : {
                  "TF_INTRA_OP_PARALLELISM_THREADS":1,
                  "TF_INTER_OP_PARALLELISM_THREADS":1,
                  "OMP_NUM_THREADS":4
                }
      },
      "command": "/fs0/home/haidi/soft/deepmd-kit-gpu-1.2.0/bin/lmp",
      "group_size": 10
    }
  ],

  "fp": [
    {
      "machine": {
        "batch": "slurm",
        "hostname": "",
        "port": 22,
        "username": "haidi",
        "work_path": "/fs0/home/haidi/work/Au/runvasp/work"
      },
      "resources": {
        "allow_failure": true,
        "ratio_failue": 0.05,
        "numb_node": 1,
        "task_per_node": 40,
        "partition": "cpu",
        "with_mpi": false,
        "task_max": 100,
        "exclude_list": ["cu11","cu02"],
        "time_limit": "48:0:0",
        "source_list": ["/fs0/software/intel/2017u5/parallel_studio_xe_2017.5.061/psxevars.sh intel64"],
        "envs" : {"PATH" : "/fs0/home/haidi/soft/vasp-5.4.4-2017u5/bin:$PATH"   }
      },
      "command": "mpirun -np 40 vasp_gam",
      "group_size": 1
    }
  ]
}

文件目录

.
├── fp
├── INCAR
├── init_clusters
│   ├── N-10-sg-0.vasp
│   ├── N-10-sg-1.vasp
│   ├── N-10-sg-2.vasp
│   ├── N-10-sg-3.vasp
│   ├── N-10-sg-4.vasp
│   ├── N-10-sg-5.vasp
│   ├── N-11-sg-0.vasp
│   ├── N-11-sg-1.vasp
│   ├── N-11-sg-2.vasp
│   ├── N-11-sg-3.vasp
│   ├── N-11-sg-4.vasp
│   ├── N-11-sg-5.vasp
│   ├── N-12-sg-0.vasp
│   ├── N-12-sg-1.vasp
│   ├── N-12-sg-2.vasp
│   ├── N-12-sg-3.vasp
│   ├── N-12-sg-4.vasp
│   ├── N-12-sg-5.vasp
│   ├── N-13-sg-0.vasp
│   ├── N-13-sg-1.vasp
│   ├── N-13-sg-2.vasp
│   ├── N-13-sg-3.vasp
│   ├── N-13-sg-4.vasp
│   ├── N-13-sg-5.vasp
│   ├── N-14-sg-0.vasp
│   ├── N-14-sg-1.vasp
│   ├── N-14-sg-2.vasp
│   ├── N-14-sg-3.vasp
│   ├── N-14-sg-4.vasp
│   ├── N-14-sg-5.vasp
│   ├── N-15-sg-0.vasp
│   ├── N-15-sg-1.vasp
│   ├── N-15-sg-2.vasp
│   ├── N-15-sg-3.vasp
│   ├── N-15-sg-4.vasp
│   ├── N-15-sg-5.vasp
│   ├── N-16-sg-0.vasp
│   ├── N-16-sg-1.vasp
│   ├── N-16-sg-2.vasp
│   ├── N-16-sg-3.vasp
│   ├── N-16-sg-4.vasp
│   ├── N-16-sg-5.vasp
│   ├── N-17-sg-0.vasp
│   ├── N-17-sg-1.vasp
│   ├── N-17-sg-2.vasp
│   ├── N-17-sg-3.vasp
│   ├── N-17-sg-4.vasp
│   ├── N-17-sg-5.vasp
│   ├── N-18-sg-0.vasp
│   ├── N-18-sg-1.vasp
│   ├── N-18-sg-2.vasp
│   ├── N-18-sg-3.vasp
│   ├── N-18-sg-4.vasp
│   ├── N-18-sg-5.vasp
│   ├── N-19-sg-0.vasp
│   ├── N-19-sg-1.vasp
│   ├── N-19-sg-2.vasp
│   ├── N-19-sg-3.vasp
│   ├── N-19-sg-4.vasp
│   ├── N-19-sg-5.vasp
│   ├── N-20-sg-0.vasp
│   ├── N-20-sg-1.vasp
│   ├── N-20-sg-2.vasp
│   ├── N-20-sg-3.vasp
│   ├── N-20-sg-4.vasp
│   └── N-20-sg-5.vasp
├── machine.json
├── POTCAR
├── KPOINTS
└── runvasp.py

运行原理

通过python paramiko软件打包本地文件到本地计算文件夹、或者远程机器计算文件夹,定时查看远程机器的计算状态,等任务计算完成之后,收回backward_files列表中的文件

运行方式

nohup python -u runvasp.py &

运行结果

jr.json记录提交信息,用于重启,防止已经提交或者已经算完的任务再次被提交

task.* 实际的计算任务

jr.json     task.00006  task.00013  task.00020  task.00027  task.00034  task.00041  task.00048  task.00055  task.00062
task.00000  task.00007  task.00014  task.00021  task.00028  task.00035  task.00042  task.00049  task.00056  task.00063
task.00001  task.00008  task.00015  task.00022  task.00029  task.00036  task.00043  task.00050  task.00057  task.00064
task.00002  task.00009  task.00016  task.00023  task.00030  task.00037  task.00044  task.00051  task.00058  task.00065
task.00003  task.00010  task.00017  task.00024  task.00031  task.00038  task.00045  task.00052  task.00059
task.00004  task.00011  task.00018  task.00025  task.00032  task.00039  task.00046  task.00053  task.00060
task.00005  task.00012  task.00019  task.00026  task.00033  task.00040  task.00047  task.00054  task.00061

文章作者: ustc-haidi
版权声明: 本博客所有文章除特别声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 ustc-haidi !
  目录