问题描述
以vasp为例,假定你有很多结构,在init_clusters里面:
init_clusters/N-10-sg-0.vasp init_clusters/N-12-sg-5.vasp init_clusters/N-15-sg-4.vasp init_clusters/N-18-sg-3.vasp
init_clusters/N-10-sg-1.vasp init_clusters/N-13-sg-0.vasp init_clusters/N-15-sg-5.vasp init_clusters/N-18-sg-4.vasp
init_clusters/N-10-sg-2.vasp init_clusters/N-13-sg-1.vasp init_clusters/N-16-sg-0.vasp init_clusters/N-18-sg-5.vasp
init_clusters/N-10-sg-3.vasp init_clusters/N-13-sg-2.vasp init_clusters/N-16-sg-1.vasp init_clusters/N-19-sg-0.vasp
init_clusters/N-10-sg-4.vasp init_clusters/N-13-sg-3.vasp init_clusters/N-16-sg-2.vasp init_clusters/N-19-sg-1.vasp
init_clusters/N-10-sg-5.vasp init_clusters/N-13-sg-4.vasp init_clusters/N-16-sg-3.vasp init_clusters/N-19-sg-2.vasp
init_clusters/N-11-sg-0.vasp init_clusters/N-13-sg-5.vasp init_clusters/N-16-sg-4.vasp init_clusters/N-19-sg-3.vasp
init_clusters/N-11-sg-1.vasp init_clusters/N-14-sg-0.vasp init_clusters/N-16-sg-5.vasp init_clusters/N-19-sg-4.vasp
init_clusters/N-11-sg-2.vasp init_clusters/N-14-sg-1.vasp init_clusters/N-17-sg-0.vasp init_clusters/N-19-sg-5.vasp
init_clusters/N-11-sg-3.vasp init_clusters/N-14-sg-2.vasp init_clusters/N-17-sg-1.vasp init_clusters/N-20-sg-0.vasp
init_clusters/N-11-sg-4.vasp init_clusters/N-14-sg-3.vasp init_clusters/N-17-sg-2.vasp init_clusters/N-20-sg-1.vasp
init_clusters/N-11-sg-5.vasp init_clusters/N-14-sg-4.vasp init_clusters/N-17-sg-3.vasp init_clusters/N-20-sg-2.vasp
init_clusters/N-12-sg-0.vasp init_clusters/N-14-sg-5.vasp init_clusters/N-17-sg-4.vasp init_clusters/N-20-sg-3.vasp
init_clusters/N-12-sg-1.vasp init_clusters/N-15-sg-0.vasp init_clusters/N-17-sg-5.vasp init_clusters/N-20-sg-4.vasp
init_clusters/N-12-sg-2.vasp init_clusters/N-15-sg-1.vasp init_clusters/N-18-sg-0.vasp init_clusters/N-20-sg-5.vasp
init_clusters/N-12-sg-3.vasp init_clusters/N-15-sg-2.vasp init_clusters/N-18-sg-1.vasp
init_clusters/N-12-sg-4.vasp init_clusters/N-15-sg-3.vasp init_clusters/N-18-sg-2.vasp
现在想要用相同的INCAR,POTCAR,KPOINTS提交任务(当然,这些也可以不一样,需要适当修改下面程序),计算完的任务全部放在fp文件夹(提前手动建立)里面。
实际上需要满足:
- 容错 custodian 可以自动解决vasp常见错误
- 重启 计算过的任务不会重新计算
- 远程提交 在本地可以向远程机器提交任务 (比如通过cl10向cl9提交任务)
- 队列系统 需要适用于常见的队列系统
运行脚本
runvasp.py
import os
import sys
import shutil
from glob import glob
from dpgen.remote.decide_machine import decide_fp_machine
from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher
from monty.serialization import loadfn,dumpfn
def create_path(path, backup=False):
    """Make sure the directory *path* exists and return it with a trailing "/".

    Args:
        path: Directory to create (intermediate directories are created as
            needed). Must be a non-empty string; an empty string raises
            IndexError from the trailing-slash check.
        backup: Only relevant when *path* already exists. If true, the
            existing directory is moved aside to ``<path>.bkNNNNNN`` (the
            first unused six-digit counter, starting at 000000) and a fresh
            empty directory is created in its place. If false, the existing
            directory is left untouched and reused.

    Returns:
        The path, normalized so that it ends with "/".
    """
    if path[-1] != "/":
        path += "/"

    if not os.path.isdir(path):
        os.makedirs(path)
        return path

    if not backup:
        # Directory already present and no backup requested: reuse it as-is.
        return path

    # path ends with "/", so dirname() yields the directory itself without
    # the trailing slash, which is what the backup suffix is appended to.
    dirname = os.path.dirname(path)
    counter = 0
    while True:
        bk_dirname = dirname + ".bk%06d" % counter
        if not os.path.isdir(bk_dirname):
            shutil.move(dirname, bk_dirname)
            break
        counter += 1
    os.makedirs(path)
    return path
#--------------------------------------------------
# Build one fp/task.NNNNN directory per input structure. All paths are
# absolute, so the process never has to change its working directory.
pwd = os.getcwd()
work_path = os.path.join(pwd, 'fp')

# Sorted so that the structure -> task index mapping is the same on every run
# (as long as the set of *.vasp files in init_clusters does not change).
fs = sorted(glob(os.path.join(pwd, 'init_clusters', '*.vasp')))

for i, structure in enumerate(fs):
    task_name = "task.%05d" % i
    task_path = os.path.join(work_path, task_name)
    create_path(task_path)
    # Keep a copy under the original file name and a second copy as POSCAR.
    shutil.copyfile(structure,
                    os.path.join(task_path, os.path.basename(structure)))
    shutil.copyfile(structure, os.path.join(task_path, 'POSCAR'))
    # The same INCAR / POTCAR / KPOINTS from the launch directory go into
    # every task directory.
    for shared in ('INCAR', 'POTCAR', 'KPOINTS'):
        shutil.copyfile(os.path.join(pwd, shared),
                        os.path.join(task_path, shared))
# os._exit(0)  # uncomment to stop here, right after the task directories are written

# Every task.* directory under fp/ is handed to the dispatcher by base name.
fp_tasks = sorted(glob(os.path.join(work_path, 'task.*')))
run_tasks = [os.path.basename(ii) for ii in fp_tasks]
#----------------------------------------------------
# File-name lists and flags that are passed to dispatcher.run_jobs() below.
forward_files = ['POSCAR', 'INCAR', 'POTCAR', 'KPOINTS']
backward_files = ['OUTCAR', 'vasprun.xml', 'CONTCAR']
forward_common_files = []
mark_failure = False
log_file = 'runlog'
err_file = 'errlog'

# Read machine.json from the current directory and let dpgen resolve the
# "fp" entry into the fp_* keys used below.
mdata = decide_fp_machine(loadfn('machine.json'))
# dumpfn(mdata, 'new.json', indent=4)
fp_command = mdata['fp_command']
fp_group_size = mdata['fp_group_size']
fp_resources = mdata['fp_resources']
# ---------------------------------------------------
dispatcher = make_dispatcher(
    mdata['fp_machine'],
    mdata_resource=fp_resources,
    work_path=work_path,
    run_tasks=run_tasks,
    group_size=fp_group_size,
)
dispatcher.run_jobs(
    fp_resources,
    [fp_command],
    work_path,
    run_tasks,
    fp_group_size,
    forward_common_files,
    forward_files,
    backward_files,
    mark_failure=mark_failure,
    outlog=log_file,
    errlog=err_file,
)
机器配置文件
machine.json
可以参考dpgen-machine
这里可以配置任意数量的机器,下面以一般的本地slurm提交为例
{
"train": [
{
"machine": {
"batch": "slurm",
"hostname": "",
"port": 22,
"username": "haidi",
"work_path": "/fs0/home/haidi/work/Au/runvasp/work"
},
"resources": {
"numb_gpu": 1,
"numb_node": 1,
"task_per_node": 30,
"partition": "gpu4",
"exclude_list": [],
"_mem_limit": 28,
"source_list": [],
"module_list": [],
"task_max": 200,
"time_limit": "48:0:0"
},
"command": "/fs0/home/haidi/soft/deepmd-kit-gpu-1.2.0/bin/dp",
"group_size": 1
}
],
"model_devi": [
{
"machine": {
"batch": "slurm",
"hostname": "",
"port": 22,
"username": "haidi",
"work_path": "/fs0/home/haidi/work/Au/runvasp/work"
},
"resources": {
"numb_gpu": 1,
"numb_node": 1,
"task_per_node": 4,
"partition": "gpu4",
"exclude_list": [],
"_mem_limit": 28,
"source_list": [],
"module_list": [],
"task_max": 100,
"time_limit": "23:0:0",
"envs" : {
"TF_INTRA_OP_PARALLELISM_THREADS":1,
"TF_INTER_OP_PARALLELISM_THREADS":1,
"OMP_NUM_THREADS":4
}
},
"command": "/fs0/home/haidi/soft/deepmd-kit-gpu-1.2.0/bin/lmp",
"group_size": 10
}
],
"fp": [
{
"machine": {
"batch": "slurm",
"hostname": "",
"port": 22,
"username": "haidi",
"work_path": "/fs0/home/haidi/work/Au/runvasp/work"
},
"resources": {
"allow_failure": true,
"ratio_failue": 0.05,
"numb_node": 1,
"task_per_node": 40,
"partition": "cpu",
"with_mpi": false,
"task_max": 100,
"exclude_list": ["cu11","cu02"],
"time_limit": "48:0:0",
"source_list": ["/fs0/software/intel/2017u5/parallel_studio_xe_2017.5.061/psxevars.sh intel64"],
"envs" : {"PATH" : "/fs0/home/haidi/soft/vasp-5.4.4-2017u5/bin:$PATH" }
},
"command": "mpirun -np 40 vasp_gam",
"group_size": 1
}
]
}
文件目录
.
├── fp
├── INCAR
├── init_clusters
│ ├── N-10-sg-0.vasp
│ ├── N-10-sg-1.vasp
│ ├── N-10-sg-2.vasp
│ ├── N-10-sg-3.vasp
│ ├── N-10-sg-4.vasp
│ ├── N-10-sg-5.vasp
│ ├── N-11-sg-0.vasp
│ ├── N-11-sg-1.vasp
│ ├── N-11-sg-2.vasp
│ ├── N-11-sg-3.vasp
│ ├── N-11-sg-4.vasp
│ ├── N-11-sg-5.vasp
│ ├── N-12-sg-0.vasp
│ ├── N-12-sg-1.vasp
│ ├── N-12-sg-2.vasp
│ ├── N-12-sg-3.vasp
│ ├── N-12-sg-4.vasp
│ ├── N-12-sg-5.vasp
│ ├── N-13-sg-0.vasp
│ ├── N-13-sg-1.vasp
│ ├── N-13-sg-2.vasp
│ ├── N-13-sg-3.vasp
│ ├── N-13-sg-4.vasp
│ ├── N-13-sg-5.vasp
│ ├── N-14-sg-0.vasp
│ ├── N-14-sg-1.vasp
│ ├── N-14-sg-2.vasp
│ ├── N-14-sg-3.vasp
│ ├── N-14-sg-4.vasp
│ ├── N-14-sg-5.vasp
│ ├── N-15-sg-0.vasp
│ ├── N-15-sg-1.vasp
│ ├── N-15-sg-2.vasp
│ ├── N-15-sg-3.vasp
│ ├── N-15-sg-4.vasp
│ ├── N-15-sg-5.vasp
│ ├── N-16-sg-0.vasp
│ ├── N-16-sg-1.vasp
│ ├── N-16-sg-2.vasp
│ ├── N-16-sg-3.vasp
│ ├── N-16-sg-4.vasp
│ ├── N-16-sg-5.vasp
│ ├── N-17-sg-0.vasp
│ ├── N-17-sg-1.vasp
│ ├── N-17-sg-2.vasp
│ ├── N-17-sg-3.vasp
│ ├── N-17-sg-4.vasp
│ ├── N-17-sg-5.vasp
│ ├── N-18-sg-0.vasp
│ ├── N-18-sg-1.vasp
│ ├── N-18-sg-2.vasp
│ ├── N-18-sg-3.vasp
│ ├── N-18-sg-4.vasp
│ ├── N-18-sg-5.vasp
│ ├── N-19-sg-0.vasp
│ ├── N-19-sg-1.vasp
│ ├── N-19-sg-2.vasp
│ ├── N-19-sg-3.vasp
│ ├── N-19-sg-4.vasp
│ ├── N-19-sg-5.vasp
│ ├── N-20-sg-0.vasp
│ ├── N-20-sg-1.vasp
│ ├── N-20-sg-2.vasp
│ ├── N-20-sg-3.vasp
│ ├── N-20-sg-4.vasp
│ └── N-20-sg-5.vasp
├── machine.json
├── POTCAR
├── KPOINTS
└── runvasp.py
运行原理
通过python的paramiko库将本地文件打包传输到本地计算文件夹或者远程机器的计算文件夹,定时查看远程机器的计算状态,等任务计算完成之后,取回backward_files列表中的文件
运行方式
nohup python -u runvasp.py &
运行结果
jr.json记录提交信息,用于重启,防止已经提交或者已经算完的任务再次被提交
task.* 实际的计算任务
jr.json task.00006 task.00013 task.00020 task.00027 task.00034 task.00041 task.00048 task.00055 task.00062
task.00000 task.00007 task.00014 task.00021 task.00028 task.00035 task.00042 task.00049 task.00056 task.00063
task.00001 task.00008 task.00015 task.00022 task.00029 task.00036 task.00043 task.00050 task.00057 task.00064
task.00002 task.00009 task.00016 task.00023 task.00030 task.00037 task.00044 task.00051 task.00058 task.00065
task.00003 task.00010 task.00017 task.00024 task.00031 task.00038 task.00045 task.00052 task.00059
task.00004 task.00011 task.00018 task.00025 task.00032 task.00039 task.00046 task.00053 task.00060
task.00005 task.00012 task.00019 task.00026 task.00033 task.00040 task.00047 task.00054 task.00061