Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 24 additions & 8 deletions arc/job/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@

constraint_type_dict = {2: 'B', 3: 'A', 4: 'D'}

# ARC keeps job-memory math in base-2 units internally. In other words, ARC's
# "GB" behaves as GiB and is converted using 1 GiB = 1024 MiB. This is
# deliberate: many chemistry codes and cluster templates ultimately consume an
# integer "MB"-style value, and using a consistent binary convention avoids
# mixing 1000- and 1024-based conversions in different parts of the pipeline.
# Human-facing decimal capacities are smaller when expressed in base-2, e.g.
# 10 GB (decimal) ~= 9.31 GiB. ARC therefore interprets job_memory_gb=10 as
# 10 GiB, not 10 decimal GB.
MEMORY_GB_TO_MIB = 1024
DEFAULT_JOB_MEMORY_OVERHEAD = 1.10
CAPPED_JOB_MEMORY_OVERHEAD = 1.05


class JobEnum(str, Enum):
"""
Expand Down Expand Up @@ -581,21 +593,25 @@ def set_cpu_and_mem(self):
f'exceeds {100 * job_max_server_node_memory_allocation}% of the maximum node memory on '
f'{self.server}. Setting it to {job_max_server_node_memory_allocation * max_mem:.2f} GB.')
self.job_memory_gb = job_max_server_node_memory_allocation * max_mem
total_submit_script_memory = self.job_memory_gb * 1024 * 1.05 # MB
total_submit_script_memory_mib = math.ceil(self.job_memory_gb * MEMORY_GB_TO_MIB * CAPPED_JOB_MEMORY_OVERHEAD)
self.job_status[1]['keywords'].append('max_total_job_memory') # Useful info when troubleshooting.
else:
total_submit_script_memory = self.job_memory_gb * 1024 * 1.1 # MB
total_submit_script_memory_mib = math.ceil(self.job_memory_gb * MEMORY_GB_TO_MIB * DEFAULT_JOB_MEMORY_OVERHEAD)
self.submit_script_memory_mib = total_submit_script_memory_mib
# Determine amount of memory in submit script based on cluster job scheduling system.
cluster_software = servers[self.server].get('cluster_soft').lower() if self.server is not None else None
if cluster_software in ['oge', 'sge', 'htcondor']:
# In SGE, "-l h_vmem=5000M" specifies the memory for all cores to be 5000 MB.
self.submit_script_memory = math.ceil(total_submit_script_memory) # in MB
# ARC uses MiB internally and passes that integer consistently to scheduler templates.
self.submit_script_memory = total_submit_script_memory_mib
if cluster_software in ['pbs']:
# In PBS, "#PBS -l select=1:ncpus=8:mem=12000000" specifies the memory for all cores to be 12 MB.
self.submit_script_memory = math.ceil(total_submit_script_memory) * 1E6 # in Bytes
# ARC keeps the PBS request in MiB as well. The template still uses
# an "mb" suffix, but the integer is derived from the same base-2
# MiB count used everywhere else in ARC.
self.submit_script_memory = total_submit_script_memory_mib
elif cluster_software in ['slurm']:
# In Slurm, "#SBATCH --mem-per-cpu=2000" specifies the memory **per cpu/thread** to be 2000 MB.
self.submit_script_memory = math.ceil(total_submit_script_memory / self.cpu_cores) # in MB
# In Slurm, "#SBATCH --mem-per-cpu=2000" is a per-core request, so
# we divide ARC's total MiB budget across the requested cores.
self.submit_script_memory = math.ceil(total_submit_script_memory_mib / self.cpu_cores)
self.set_input_file_memory()

def as_dict(self) -> dict:
Expand Down
2 changes: 1 addition & 1 deletion arc/job/adapter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def test_set_cpu_and_mem(self):
self.job_4.server = 'server3'
self.job_4.cpu_cores = None
self.job_4.set_cpu_and_mem()
expected_memory = math.ceil(14 * 1024 * 1.1) * 1E6
expected_memory = math.ceil(14 * 1024 * 1.1)
self.assertEqual(self.job_4.submit_script_memory, expected_memory)
self.job_4.server = 'local'

Expand Down
1 change: 1 addition & 0 deletions arc/job/adapters/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def _initialize_adapter(obj: 'JobAdapter',
obj.server_nodes = server_nodes or list()
obj.species = [species] if species is not None and not isinstance(species, list) else species
obj.submit_script_memory = None
obj.submit_script_memory_mib = None
obj.testing = testing
obj.times_rerun = times_rerun
obj.torsions = [torsions] if torsions is not None and not isinstance(torsions[0], list) else torsions
Expand Down
16 changes: 14 additions & 2 deletions arc/job/adapters/gaussian.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@
settings['default_job_settings'], settings['global_ess_settings'], settings['input_filenames'], \
settings['output_filenames'], settings['servers'], settings['submit_filenames']

# Gaussian should not consume the entire scheduler allocation. ARC reserves a
# fixed fraction of the submit-script memory for non-Gaussian overhead such as
# the scheduler, runtime, scratch bookkeeping, and Gaussian allocations outside
# the explicit %mem budget.
GAUSSIAN_MEMORY_HEADROOM_FRACTION = 0.90


# job_type_1: '' for sp, irc, or composite methods, 'opt=calcfc', 'opt=(calcfc,ts,noeigen)',
# job_type_2: '' or 'freq iop(7/33=1)' (cannot be combined with CBS-QB3)
Expand Down Expand Up @@ -493,8 +499,14 @@ def set_input_file_memory(self) -> None:
"""
Set the input_file_memory attribute.
"""
# Gaussian's memory is in MB, total for all cpu cores
self.input_file_memory = math.ceil(self.job_memory_gb * 1024)
# Gaussian's %mem is the total memory budget for the process. ARC keeps
# scheduler memory in MiB and intentionally gives Gaussian only part of
# that total so the queue allocation retains headroom. This matters most
# on capped nodes: e.g., a human "10 GB" node is only ~9.31 GiB, and if
# ARC already requests ~95% of a node, passing the entire allocation to
# %mem leaves too little room for runtime overhead and can trigger galloc.
submit_script_memory_mib = self.submit_script_memory_mib or math.ceil(self.job_memory_gb * 1024)
self.input_file_memory = max(1, math.floor(submit_script_memory_mib * GAUSSIAN_MEMORY_HEADROOM_FRACTION))

def execute_incore(self):
"""
Expand Down
57 changes: 30 additions & 27 deletions arc/job/adapters/gaussian_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,23 +515,26 @@ def test_set_cpu_and_mem(self):
"""Test assigning number of cpu's and memory"""
self.job_8.input_file_memory = None
self.job_8.submit_script_memory = None
self.job_8.submit_script_memory_mib = None
self.job_8.server = 'server2'
self.job_8.set_cpu_and_mem()
self.assertEqual(self.job_8.cpu_cores, 8)
self.assertEqual(self.job_8.submit_script_memory_mib, math.ceil(14 * 1024 * 1.1))
self.assertLess(self.job_8.input_file_memory, self.job_8.submit_script_memory_mib)

def test_set_input_file_memory(self):
"""Test setting the input_file_memory argument"""
expected_memory = math.ceil(14 * 1024)
expected_memory = math.floor(math.ceil(14 * 1024 * 1.1) * 0.9)
self.assertEqual(self.job_1.input_file_memory, expected_memory)
self.assertEqual(self.job_2.input_file_memory, 14336)
self.assertEqual(self.job_2.input_file_memory, expected_memory)

def test_write_input_file_multi(self):
"""Test writing Gaussian input files"""
self.job_multi.write_input_file()
with open(os.path.join(self.job_multi.local_path, input_filenames[self.job_multi.job_adapter]), 'r') as f:
content_multi = f.read()
job_multi_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc) SCRF=(smd, Solvent=water) uwb97xd/def2tzvp IOp(2/9=2000)
Expand All @@ -545,7 +548,7 @@ def test_write_input_file_multi(self):

--link1--
%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc) SCRF=(smd, Solvent=water) uwb97xd/def2tzvp IOp(2/9=2000)
Expand All @@ -559,7 +562,7 @@ def test_write_input_file_multi(self):

--link1--
%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc) SCRF=(smd, Solvent=water) wb97xd/def2tzvp IOp(2/9=2000)
Expand Down Expand Up @@ -588,7 +591,7 @@ def test_write_input_file(self):
with open(os.path.join(self.job_1.local_path, input_filenames[self.job_1.job_adapter]), 'r') as f:
content_1 = f.read()
job_1_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc) cbs-qb3 IOp(2/9=2000) IOp(1/12=5,3/44=0)
Expand All @@ -606,7 +609,7 @@ def test_write_input_file(self):
with open(os.path.join(self.job_3.local_path, input_filenames[self.job_3.job_adapter]), 'r') as f:
content_3 = f.read()
job_3_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc) SCRF=(smd, Solvent=water) uwb97xd/def2tzvp IOp(2/9=2000)
Expand All @@ -624,7 +627,7 @@ def test_write_input_file(self):
with open(os.path.join(self.job_4.local_path, input_filenames[self.job_4.job_adapter]), 'r') as f:
content_4 = f.read()
job_4_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxStep=5,modredundant,noeigentest) integral=(grid=ultrafine, Acc2E=12) guess=mix wb97xd/def2tzvp IOp(2/9=2000) scf=(direct,tight)
Expand Down Expand Up @@ -657,7 +660,7 @@ def test_write_input_file(self):
with open(os.path.join(self.job_5.local_path, input_filenames[self.job_5.job_adapter]), 'r') as f:
content_5 = f.read()
job_5_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P uwb97xd/def2tzvp freq IOp(7/33=1) integral=(grid=ultrafine, Acc2E=12) IOp(2/9=2000) scf=(direct,tight)
Expand All @@ -675,7 +678,7 @@ def test_write_input_file(self):
with open(os.path.join(self.job_6.local_path, input_filenames[self.job_6.job_adapter]), 'r') as f:
content_6 = f.read()
job_6_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc) uwb97xd/def2tzvp IOp(2/9=2000)
Expand All @@ -693,7 +696,7 @@ def test_write_input_file(self):
with open(os.path.join(self.job_7.local_path, input_filenames[self.job_7.job_adapter]), 'r') as f:
content_7 = f.read()
job_7_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P irc=(CalcAll,maxpoints=50,reverse,stepsize=7) uwb97xd/def2tzvp IOp(2/9=2000)
Expand All @@ -711,7 +714,7 @@ def test_write_input_file(self):
with open(os.path.join(self.job_opt_uff.local_path, input_filenames[self.job_opt_uff.job_adapter]), 'r') as f:
content_opt_uff = f.read()
job_opt_uff_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt uff IOp(2/9=2000)
Expand Down Expand Up @@ -776,7 +779,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_10.local_path, input_filenames[self.job_10.job_adapter]), 'r') as f:
content_10 = f.read()
job_10_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) uwb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight)
Expand All @@ -794,7 +797,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_11.local_path, input_filenames[self.job_11.job_adapter]), 'r') as f:
content_11 = f.read()
job_11_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight)
Expand All @@ -820,7 +823,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_12.local_path, input_filenames[self.job_12.job_adapter]), 'r') as f:
content_12 = f.read()
job_12_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(direct,tight,xqc)
Expand All @@ -846,7 +849,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_13.local_path, input_filenames[self.job_13.job_adapter]), 'r') as f:
content_13 = f.read()
job_13_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(NDamp=30,direct,tight,xqc)
Expand All @@ -872,7 +875,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_14.local_path, input_filenames[self.job_14.job_adapter]), 'r') as f:
content_14 = f.read()
job_14_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(NDamp=30,NoDIIS,direct,tight,xqc)
Expand All @@ -898,7 +901,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_15.local_path, input_filenames[self.job_15.job_adapter]), 'r') as f:
content_15 = f.read()
job_15_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,cartesian,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(NDamp=30,NoDIIS,direct,tight,xqc)
Expand All @@ -925,7 +928,7 @@ def test_trsh_write_input_file(self):
content_16 = f.read()

job_16_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(cartesian) integral=(grid=ultrafine, Acc2E=14) guess=INDO wb97xd IOp(2/9=2000) nosymm scf=(Fermi,NDamp=30,NoDIIS,NoVarAcc,Noincfock,direct,tight,xqc)
Expand All @@ -952,7 +955,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_17.local_path, input_filenames[self.job_17.job_adapter]), 'r') as f:
content_17 = f.read()
job_17_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=200,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight,xqc)
Expand All @@ -978,7 +981,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_18.local_path, input_filenames[self.job_18.job_adapter]), 'r') as f:
content_18 = f.read()
job_18_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) int=grid=300590 scf=(direct,tight)
Expand All @@ -1004,7 +1007,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_19.local_path, input_filenames[self.job_19.job_adapter]), 'r') as f:
content_19 = f.read()
job_19_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(Fermi,NDamp=30,NoDIIS,NoVarAcc,Noincfock,direct,tight,xqc)
Expand All @@ -1030,7 +1033,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_20.local_path, input_filenames[self.job_20.job_adapter]), 'r') as f:
content_20 = f.read()
job_20_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(NDamp=30,NoDIIS,NoVarAcc,direct,tight,xqc)
Expand All @@ -1057,7 +1060,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_21.local_path, input_filenames[self.job_21.job_adapter]), 'r') as f:
content_21 = f.read()
job_21_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=INDO wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) int=grid=300590 scf=(NDamp=30,NoDIIS,NoVarAcc,direct,tight,xqc)
Expand All @@ -1084,7 +1087,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_22.local_path, input_filenames[self.job_22.job_adapter]), 'r') as f:
content_22 = f.read()
job_22_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(calcfc,maxcycle=200,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight)
Expand All @@ -1111,7 +1114,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_23.local_path, input_filenames[self.job_23.job_adapter]), 'r') as f:
content_23 = f.read()
job_23_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(RFO,calcfc,maxcycle=200,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight)
Expand All @@ -1138,7 +1141,7 @@ def test_trsh_write_input_file(self):
with open(os.path.join(self.job_24.local_path, input_filenames[self.job_24.job_adapter]), 'r') as f:
content_24 = f.read()
job_24_expected_input_file = """%chk=check.chk
%mem=14336mb
%mem=14193mb
%NProcShared=8

#P opt=(GDIIS,calcfc,maxcycle=200,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight)
Expand Down
23 changes: 19 additions & 4 deletions arc/job/trsh.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@


delete_command, inconsistency_ab, inconsistency_az, maximum_barrier, preserve_params_in_scan, rotor_scan_resolution, \
servers, submit_filenames = settings['delete_command'], settings['inconsistency_ab'], settings['inconsistency_az'], \
settings['maximum_barrier'], settings['preserve_params_in_scan'], \
settings['rotor_scan_resolution'], settings['servers'], settings['submit_filenames']
servers, submit_filenames, default_job_settings = settings['delete_command'], settings['inconsistency_ab'], \
settings['inconsistency_az'], settings['maximum_barrier'], \
settings['preserve_params_in_scan'], \
settings['rotor_scan_resolution'], settings['servers'], \
settings['submit_filenames'], settings['default_job_settings']


def determine_ess_status(output_path: str,
Expand Down Expand Up @@ -980,11 +982,24 @@ def trsh_ess_job(label: str,
# Increase memory allocation
couldnt_trsh = False
max_mem = servers[server].get('memory', 128) # Node memory in GB, defaults to 128 if not specified
memory = min(memory_gb * 2, max_mem * 0.95)
max_mem_allocation = max_mem * default_job_settings.get('job_max_server_node_memory_allocation', 0.95)
memory = min(memory_gb * 2, max_mem_allocation)
if memory > memory_gb:
logger.info(f'Troubleshooting {job_type} job in {software} for {label} using more memory: {memory} GB '
f'instead of {memory_gb} GB')
ess_trsh_methods.append('memory')
else:
couldnt_trsh = True
output_errors.append(
f'Error: Could not troubleshoot {job_type} for {label}! Gaussian exhausted memory even after ARC '
f'reached the configured node-memory cap ({max_mem_allocation:.2f} GB total allocation) while '
f'still reserving scheduler headroom. Use a higher-memory node or lower the job cost. '
)
logger.error(
f'Could not troubleshoot {job_type} job in {software} for {label}. ARC already reached the '
f'configured node-memory cap ({max_mem_allocation:.2f} GB total allocation) and still preserved '
f'Gaussian headroom.'
)

if attempted_ess_trsh_methods:
if attempted_ess_trsh_methods == ess_trsh_methods:
Expand Down
Loading
Loading