# Using bwUniCluster

## Login

There are 4 login nodes and 2 gateways that redirect to any of the login nodes in a load-balanced way:

| Hostname | Node type |
|---|---|
|  | login to one of the four login nodes |
|  | login to one of the four login nodes |
The login nodes can also be reached directly:

| Hostname | Node type |
|---|---|
|  | bwUniCluster 2.0 first login node |
|  | bwUniCluster 2.0 second login node |
|  | bwUniCluster 2.0 third login node |
|  | bwUniCluster 2.0 fourth login node |
Host key fingerprints:

| Algorithm | Fingerprint (SHA256) |
|---|---|
| RSA |  |
| ECDSA |  |
| ED25519 |  |
More details can be found in the wiki page bwUniCluster2.0/Login.
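
A typical interactive login then looks like this; the username and hostname below are placeholders, not values taken from this page:

ssh <username>@<hostname>

On the first connection, the host key presented by the server can be compared against the fingerprints listed above.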

## Building dependencies

### Boost
# last update: June 2023
module load compiler/gnu/10.2 mpi/openmpi/4.1
mkdir boost-build
cd boost-build
BOOST_VERSION=1.82.0
BOOST_DOMAIN="https://boostorg.jfrog.io/artifactory/main"
BOOST_ROOT="${HOME}/bin/boost_mpi_${BOOST_VERSION//./_}"
mkdir -p "${BOOST_ROOT}"
curl -sL "${BOOST_DOMAIN}/release/${BOOST_VERSION}/source/boost_${BOOST_VERSION//./_}.tar.bz2" | tar xj
cd "boost_${BOOST_VERSION//./_}"
# register the MPI toolchain with Boost.Build so that Boost.MPI gets built
echo 'using mpi ;' > tools/build/src/user-config.jam
./bootstrap.sh --with-libraries=filesystem,system,mpi,serialization,test
./b2 -j 4 install --prefix="${BOOST_ROOT}"
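
As a quick sanity check (not part of the official instructions), the install prefix should now contain the MPI-enabled Boost libraries:

ls "${BOOST_ROOT}"/lib/libboost_mpi*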

### FFTW
# last update: June 2023
module load compiler/gnu/10.2 mpi/openmpi/4.1
mkdir fftw-build
cd fftw-build
FFTW3_VERSION=3.3.10
FFTW3_ROOT="${HOME}/bin/fftw_${FFTW3_VERSION//./_}"
curl -sL "https://www.fftw.org/fftw-${FFTW3_VERSION}.tar.gz" | tar xz
cd "fftw-${FFTW3_VERSION}"
./configure --enable-shared --enable-mpi --enable-threads --enable-openmp \
--disable-fortran --enable-avx --prefix="${FFTW3_ROOT}"
make -j 4
make install
make clean
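
As with Boost, a quick sanity check confirms that the MPI- and OpenMP-enabled FFTW libraries were installed:

ls "${FFTW3_ROOT}"/lib/libfftw3_mpi* "${FFTW3_ROOT}"/lib/libfftw3_omp*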

### CUDA
# last update: June 2023
module load compiler/gnu/10.2 devel/cuda/12.0
export CLUSTER_CUDA_ROOT="${HOME}/bin/cuda_12_0"
mkdir -p "${CLUSTER_CUDA_ROOT}/lib"
# expose the CUDA driver stub library under both the linker name and the soname,
# so GPU code can be linked and its dependencies resolved on nodes without the NVIDIA driver
ln -s "${CUDA_HOME}/targets/x86_64-linux/lib/stubs/libcuda.so" "${CLUSTER_CUDA_ROOT}/lib/libcuda.so"
ln -s "${CUDA_HOME}/targets/x86_64-linux/lib/stubs/libcuda.so" "${CLUSTER_CUDA_ROOT}/lib/libcuda.so.1"
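
The directory should now contain the two symlinks, which can be verified with:

ls -l "${CLUSTER_CUDA_ROOT}/lib"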

## Building software

### ESPResSo
Release 4.2:
# last update: June 2023
module load compiler/gnu/10.2 mpi/openmpi/4.1 devel/cmake/3.23.3 devel/cuda/12.0 \
lib/hdf5/1.12.2-gnu-10.2-openmpi-4.1 devel/python/3.8.6_gnu_10.2
CLUSTER_FFTW3_VERSION=3.3.10
CLUSTER_BOOST_VERSION=1.82.0
export BOOST_ROOT="${HOME}/bin/boost_mpi_${CLUSTER_BOOST_VERSION//./_}"
export FFTW3_ROOT="${HOME}/bin/fftw_${CLUSTER_FFTW3_VERSION//./_}"
export CUDA_ROOT="${HOME}/bin/cuda_12_0"
export LD_LIBRARY_PATH="${BOOST_ROOT}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH="${FFTW3_ROOT}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}${CUDA_HOME}/targets/x86_64-linux/lib/stubs"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}${CUDA_ROOT}/lib"
git clone --recursive --branch 4.2 --origin upstream \
https://github.com/espressomd/espresso.git espresso-4.2
cd espresso-4.2
python3 -m pip install --user -c "requirements.txt" cython setuptools numpy scipy vtk h5py
mkdir build
cd build
# start from the maximal feature set and drop the ADDITIONAL_CHECKS debug feature
cp ../maintainer/configs/maxset.hpp myconfig.hpp
sed -i "/ADDITIONAL_CHECKS/d" myconfig.hpp
cmake .. -D CMAKE_BUILD_TYPE=Release -D WITH_CUDA=ON \
-D WITH_CCACHE=OFF -D WITH_SCAFACOS=OFF -D WITH_HDF5=ON
make -j 4
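
To verify the build, the Python module can be imported directly from the build directory, mirroring the PYTHONPATH setup used in the job script below (a minimal smoke test; the same check works for the 4.3 build):

PYTHONPATH="${PWD}/src/python${PYTHONPATH:+:$PYTHONPATH}" python3 -c "import espressomd; print(espressomd.__file__)"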
Release 4.3:
# last update: June 2023
module load compiler/gnu/10.2 mpi/openmpi/4.1 devel/cmake/3.23.3 devel/cuda/12.0 \
lib/hdf5/1.12.2-gnu-10.2-openmpi-4.1 devel/python/3.8.6_gnu_10.2
CLUSTER_FFTW3_VERSION=3.3.10
CLUSTER_BOOST_VERSION=1.82.0
export BOOST_ROOT="${HOME}/bin/boost_mpi_${CLUSTER_BOOST_VERSION//./_}"
export FFTW3_ROOT="${HOME}/bin/fftw_${CLUSTER_FFTW3_VERSION//./_}"
export CUDA_ROOT="${HOME}/bin/cuda_12_0"
export LD_LIBRARY_PATH="${BOOST_ROOT}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH="${FFTW3_ROOT}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}${CUDA_HOME}/targets/x86_64-linux/lib/stubs"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}${CUDA_ROOT}/lib"
git clone --recursive --branch python --origin upstream \
https://github.com/espressomd/espresso.git espresso-4.3
cd espresso-4.3
python3 -m pip install --user -c "requirements.txt" cython setuptools numpy scipy vtk h5py
mkdir build
cd build
cp ../maintainer/configs/maxset.hpp myconfig.hpp
sed -i "/ADDITIONAL_CHECKS/d" myconfig.hpp
cmake .. -D CUDAToolkit_ROOT="/opt/bwhpc/common/devel/cuda/12.0" \
-D CMAKE_BUILD_TYPE=Release -D ESPRESSO_BUILD_WITH_CUDA=ON \
-D ESPRESSO_BUILD_WITH_CCACHE=OFF -D ESPRESSO_BUILD_WITH_WALBERLA=ON \
-D ESPRESSO_BUILD_WITH_SCAFACOS=OFF -D ESPRESSO_BUILD_WITH_HDF5=ON
make -j 4

## Submitting jobs
Batch command:
sbatch --partition=dev_multiple --nodes=2 --ntasks-per-node=2 job.sh
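
Once submitted, the job can be monitored with standard SLURM tools, for example (replace the job id placeholder with the id printed by sbatch):

squeue -u "${USER}"
scontrol show job <jobid>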
Job script:
#!/bin/bash
#SBATCH --job-name=test
#SBATCH --time=00:10:00
#SBATCH --output %j.stdout
#SBATCH --error %j.stderr
# last update: July 2023
module load compiler/gnu/10.2 mpi/openmpi/4.1 devel/cmake/3.23.3 devel/cuda/12.0 \
lib/hdf5/1.12.2-gnu-10.2-openmpi-4.1 devel/python/3.8.6_gnu_10.2
CLUSTER_FFTW3_VERSION=3.3.10
CLUSTER_BOOST_VERSION=1.82.0
export BOOST_ROOT="${HOME}/bin/boost_mpi_${CLUSTER_BOOST_VERSION//./_}"
export FFTW3_ROOT="${HOME}/bin/fftw_${CLUSTER_FFTW3_VERSION//./_}"
export CUDA_ROOT="${HOME}/bin/cuda_12_0"
export LD_LIBRARY_PATH="${BOOST_ROOT}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH="${FFTW3_ROOT}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}${CUDA_HOME}/targets/x86_64-linux/lib/stubs"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}${CUDA_ROOT}/lib"
# adjust the build directory below to the one actually used when building ESPResSo
export PYTHONPATH="${HOME}/espresso-4.3/build-maxset/src/python${PYTHONPATH:+:$PYTHONPATH}"
mpiexec --bind-to core --map-by core python3 script.py

The bwUniCluster documentation recommends using the MPI-specific launcher, i.e. `mpiexec` or `mpirun` for OpenMPI, instead of SLURM's `srun`. The number of processes and the node information are passed to the launcher automatically. When using `srun` instead of the MPI-specific launcher, and the job script loads Python via `module load`, it is necessary to preload the SLURM shared objects, like so:
LD_PRELOAD=/usr/lib64/slurm/libslurmfull.so \
sbatch --partition=dev_multiple --nodes=2 --ntasks-per-node=2 job.sh
Otherwise, the following fatal error is triggered:
python3: error: plugin_load_from_file: dlopen(/usr/lib64/slurm/auth_munge.so): /usr/lib64/slurm/auth_munge.so: undefined symbol: slurm_conf
python3: error: Couldn't load specified plugin name for auth/munge: Dlopen of plugin file failed
python3: error: cannot create auth context for auth/munge
python3: fatal: failed to initialize auth plugin