10852
Comment:
|
17106
|
Deletions are marked like this. | Additions are marked like this. |
Line 81: | Line 81: |
During the installation phase we compiled simple "MPI-HelloWorld" program. Start it without torque |
During the installation phase we compiled a simple MPI Hello World program. Start it without Torque |
Line 129: | Line 129: |
#NOTES: # in whatever mode you are running your Eucalyptus system, you will have two interfaces, one is called public(you can reach the instances from outside), # one is called inside(you can reach the instances usually only from inside, or maybe from your front end) # you have to specify a TORQUE SERVER and one or more TORQUE NODES, because I am accessing those instances you have to specify working IP addresses. # example: SYSTEM mode, both interfaces are using the same address, so there is no # example: MANAGED modes, public and private interface are different, TORQUE has to be setup for the PRIVATE mode (intra-cloud communtcation need firewall settings) # usage: bash start_torque.sh --verbose -s="192.168.0.13" -n="192.168.0.14,192.168.0.17,192.168.0.45" -k="~/.euca/mykey.priv" # default VERBOSE=0 IN_INSTANCE=0 echo `hostname` : `/sbin/ifconfig eth0 | grep "inet addr" | awk '{print $2}' | sed 's/addr\://'` for i in $* do case $i in -s=*|--torque-server=*) # remove option from string PUBLIC_TORQUE_SERVER_IP=`echo $i | sed 's/[-a-zA-Z0-9]*=//'` echo $PUBLIC_TORQUE_SERVER_IP ;; -n=*|--torque-nodes=*) # remove option from string NIPS=`echo $i | sed 's/[-a-zA-Z0-9]*=//'` PUBLIC_NODES_IP=`echo $NIPS | sed 's/\,/ /g'` echo $NIPS echo $PUBLIC_NODES_IP ;; -k=*|--key=*) # remove option from string KEY=`echo $i | sed 's/[-a-zA-Z0-9]*=//'` echo $KEY ;; --verbose) VERBOSE=1 ;; -i|--in-instance) IN_INSTANCE=1 ;; -m=*|--with-mpi=*) MPI=`echo $i | sed 's/[-a-zA-Z0-9]*=//'` #TODO: only 0 or 1 are feasible values ;; *) echo "unknown option" ;; esac done cat > keygen_in_instance.sh << EOF #!/bin/bash su guest -c 'ssh-keygen -t rsa -N "" -f /home/guest/.ssh/id_rsa' EOF chmod 755 keygen_in_instance.sh # BEGIN execution on master ################################################# if [ $IN_INSTANCE -eq 0 ] ; then # join server and nodes if [[ $PUBLIC_NODES_IP == *$PUBLIC_TORQUE_SERVER_IP* ]] then ALL_INSTANCES="$PUBLIC_NODES_IP" else ALL_INSTANCES="$PUBLIC_TORQUE_SERVER_IP 
$PUBLIC_NODES_IP" fi echo $ALL_INSTANCES # copy setup-torque-script to eucalyptus instances for NODE_IP in `echo $ALL_INSTANCES` do echo $NODE_IP # make this host known to ~/.ssh/known_hosts eval "ssh -i $KEY -o StrictHostKeychecking=no root@$NODE_IP echo ''" eval "/usr/bin/scp -p -i $KEY start_torque.sh root@$NODE_IP:/root/start_torque.sh" # MPI example eval "scp -p -i $KEY compileMPI.sh helloworld.c root@$NODE_IP:/root/" # start script in instance eval "ssh -X -i $KEY root@$NODE_IP \"/root/start_torque.sh -s=\"$PUBLIC_TORQUE_SERVER_IP\" -n=\"$NIPS\" -i -m=$MPI \"" done # generate keys in instances - for user guest for NODE_IP in `echo $ALL_INSTANCES` do echo $NODE_IP eval "/usr/bin/scp -p -i $KEY keygen_in_instance.sh root@$NODE_IP:/root/keygen_in_instance.sh" eval "ssh -X -i $KEY root@$NODE_IP \"/root/keygen_in_instance.sh\"" done # distribute keys for NODE_IP in `echo $ALL_INSTANCES` do # distribute this key to all other nodes for NODE_IP2 in `echo $ALL_INSTANCES` do echo $NODE_IP2 eval "/usr/bin/scp -p -i $KEY root@$NODE_IP:/home/guest/.ssh/id_rsa.pub /tmp/id_rsa.pub" eval "/usr/bin/scp -p -i $KEY /tmp/id_rsa.pub root@$NODE_IP2:/tmp/id_rsa.pub" eval "ssh -X -i $KEY root@$NODE_IP2 \"cat /tmp/id_rsa.pub >> /home/guest/.ssh/authorized_keys\"" #TODO, I need an entry in known_hosts, for now the following happens from within the instances #eval "ssh -X -i $KEY root@$NODE_IP \"ssh -o StrictHostKeychecking=no guest@$NODE_IP2 & echo '' & wait\"" done eval "ssh -X -i $KEY root@$NODE_IP /root/hosts.sh" done exit # on master don't execute commands for instances fi # END execution on master ################################################# # BEGIN execution in instance ############################################### usage() { cat << EOF usage: $0 options This script starts the torque environment. OPTIONS: -h Show this message -n nodes e.g. "192.168.0.14,192.168.0.14" -s torque server ip e.g. 
"192.168.0.13" -k key file -v Verbose -m With MPI support example: start_torque.sh --verbose -s="192.168.0.13" -n="192.168.0.14,192.168.0.17,192.168.0.45" -k="~/.euca/mykey.priv" EOF } #SERVER_IP NIPS KEY #if [[ -z $PUBLIC_NODES_IP ]] || [[ -z $PUBLIC_TORQUE_SERVER_IP ]] #then # usage # exit 1 #fi |
|
Line 162: | Line 312: |
#SET values #conf1 PUBLIC_TORQUE_SERVER_IP="192.168.0.2" PUBLIC_NODES="192.168.0.3 192.168.0.4 192.168.0.5 192.168.0.6 192.168.0.7 192.168.0.8" |
|
Line 174: | Line 319: |
#MODE="system" | |
Line 184: | Line 329: |
PUBLIC_INSTANCE_IP=`curl -s $METADATA_URL/public-ipv4` | if [ $MODE == "public" ] ; then PUBLIC_INSTANCE_IP=`/sbin/ifconfig eth0 | grep "inet addr" | awk '{print $2}' | sed 's/addr\://'` else |
Line 186: | Line 334: |
PUBLIC_INSTANCE_IP=`curl -s $METADATA_URL/public-ipv4` fi |
|
Line 199: | Line 350: |
NODES=$PUBLIC_NODES | NODES=$PUBLIC_NODES_IP |
Line 259: | Line 410: |
# install libopenmpi-dev install_package "libopenmpi-dev" # install openmpi-bin install_package "openmpi-bin" |
# install OpenMPI packages if [ $MPI -eq 1 ] ; then install_package "libopenmpi-dev" install_package "openmpi-bin" #compile MPI test program bash compileMPI.sh fi |
Line 266: | Line 419: |
Line 273: | Line 427: |
echo "$NODE_IP $NODE_HOSTNAME" >> /etc/torque/hostfile | mkdir -p /etc/torque echo "$NODE_HOSTNAME slots=1" >> /etc/torque/hostfile |
Line 279: | Line 434: |
for NODE_IP in `echo $PUBLIC_NODES` | for NODE_IP in `echo $PUBLIC_NODES_IP` |
Line 282: | Line 437: |
echo "$NODE_IP $NODE_HOSTNAME" >> /etc/hosts | if [ $INSTANCE_IP != $TORQUE_SERVER_IP ] || [ $NODE_IP != $TORQUE_SERVER_IP ]; then if ! egrep -q "$NODE_IP|$NODE_HOSTNAME" /etc/hosts ; then echo "$NODE_IP $NODE_HOSTNAME" >> /etc/hosts fi fi |
Line 284: | Line 443: |
echo "$NODE_IP $NODE_HOSTNAME" >> /etc/torque/hostfile | mkdir -p /etc/torque if ! egrep -q "$NODE_HOSTNAME" /etc/torque/hostfile ; then echo "$NODE_HOSTNAME slots=1" >> /etc/torque/hostfile echo "(su - guest -c \"ssh -t -t -o StrictHostKeychecking=no guest@$NODE_HOSTNAME echo ''\")& wait" >> /root/hosts.sh # for key distribution fi |
Line 286: | Line 449: |
fi |
if ! egrep -q "$PUBLIC_TORQUE_SERVER_HOSTNAME" /etc/torque/hostfile ; then echo "(su - guest -c \"ssh -t -t -o StrictHostKeychecking=no guest@$PUBLIC_TORQUE_SERVER_HOSTNAME echo ''\")& wait" >> /root/hosts.sh # for key distribution fi fi chmod 755 /root/hosts.sh |
Line 294: | Line 460: |
echo "127.0.1.1 $PUBLIC_INSTANCE_HOSTNAME" >> /etc/hosts echo "$PRIVATE_INSTANCE_IP $PRIVATE_INSTANCE_HOSTNAME" >> /etc/hosts |
if ! egrep -q "127.0.1.1|$PUBLIC_INSTANCE_HOSTNAME" /etc/hosts ; then echo "127.0.1.1 $PUBLIC_INSTANCE_HOSTNAME" >> /etc/hosts fi # echo "$PRIVATE_INSTANCE_IP $PRIVATE_INSTANCE_HOSTNAME" >> /etc/hosts |
Line 298: | Line 466: |
echo "$TORQUE_SERVER_IP $TORQUE_SERVER_HOSTNAME" >> /etc/hosts | if ! egrep -q "$TORQUE_SERVER_IP|$TORQUE_SERVER_HOSTNAME" /etc/hosts ; then echo "$TORQUE_SERVER_IP $TORQUE_SERVER_HOSTNAME" >> /etc/hosts fi |
Line 312: | Line 482: |
apt-get -o Dpkg::Options::="--force-confnew" --force-yes -y install torque-mom torque-server torque-scheduler torque-client #aptitude -y install torque-mom torque-server torque-scheduler torque-client else |
apt-get -o Dpkg::Options::="--force-confnew" --force-yes -y install torque-server torque-scheduler torque-client #aptitude -y install torque-server torque-scheduler torque-client fi if [[ $PUBLIC_NODES_IP == *$INSTANCE_IP* ]]; then |
Line 322: | Line 494: |
Line 323: | Line 496: |
USER=userA | USER=auser |
Line 341: | Line 514: |
## on TORQUE mom echo $TORQUE_SERVER_HOSTNAME > /etc/torque/server_name echo "\$timeout 120" > /var/spool/torque/mom_priv/config # more options possible (NFS...) echo "\$loglevel 5" >> /var/spool/torque/mom_priv/config # more options possible (NFS...) /etc/init.d/torque-mom restart cat /var/spool/torque/mom_logs/$DATE ## on TORQUE server |
## for TORQUE mom if [[ $PUBLIC_NODES_IP == *$INSTANCE_IP* ]]; then echo $TORQUE_SERVER_HOSTNAME > /etc/torque/server_name echo "\$timeout 120" > /var/spool/torque/mom_priv/config # more options possible (NFS...) echo "\$loglevel 5" >> /var/spool/torque/mom_priv/config # more options possible (NFS...) /etc/init.d/torque-mom restart cat /var/spool/torque/mom_logs/$DATE fi ## for TORQUE server |
Line 379: | Line 552: |
#debug | |
Line 383: | Line 556: |
fi }}} |
cat /etc/torque/server_name fi }}} |
Running Torque inside of Eucalyptus
We describe how to set up a Torque cluster system within a Eucalyptus cloud.
$ source ~/.euca/eucarc
Specify a Squeeze image
$ EMI=emi-1AF00C98
Start two instances of our Squeeze image
$ euca-run-instances $EMI -k mykey -t c1.medium -n2 RESERVATION r-4488080C myuser myuser-default INSTANCE i-57E309BE emi-1AF00C98 0.0.0.0 0.0.0.0 pending mykey 2010-09-13T02:31:51.172Z eki-D224100C eri-059910F2 INSTANCE i-4C1F0986 emi-1AF00C98 0.0.0.0 0.0.0.0 pending mykey 2010-09-13T02:31:51.173Z eki-D224100C eri-059910F2
After a few seconds it will be running
$ euca-describe-instances RESERVATION r-4488080C myuser default INSTANCE i-4C1F0986 emi-1AF00C98 192.168.0.14 192.168.0.14 running mykey 1 c1.medium 2010-09-13T02:31:51.173Z mycloud eki-D224100C eri-059910F2 INSTANCE i-57E309BE emi-1AF00C98 192.168.0.15 192.168.0.15 running mykey 0 c1.medium 2010-09-13T02:31:51.172Z mycloud eki-D224100C eri-059910F2
Let's say you want to start a Torque server on 192.168.0.14 and two Torque workers on 192.168.0.14 and 192.168.0.15, with MPI enabled
- $ bash start_torque.sh -s="192.168.0.14" -n="192.168.0.14,192.168.0.15" -k="~/.euca/mykey.priv" -m=1
}}}
This will install all necessary torque packages in the instances. It might take a few minutes, depending on the internet connection and processor speed of the instances.
Connect to an instance as root with your key
ssh -X -i ~/.euca/mykey.priv root@192.168.0.14
virtual: Switch to the guest user
su - guest
Check if nodes are up
pbsnodes
Perform some simple tests
echo "sleep 10" | qsub echo "sleep 5" | qsub echo "hostname" | qsub echo "sleep 15" | qsub echo "hostname" | qsub echo "sleep 3" | qsub
Look at the queue
qstat
Run a sleep job on 2 worker nodes
echo "sleep 10" | qsub -l nodes=2
Check if both nodes are in state 'job-exclusive'
pbsnodes
During the installation phase we compiled a simple MPI Hello World program.
Start it without Torque
$ mpiexec -n 4 /tmp/hello.out Hello MPI from the server process! Hello MPI! mesg from 1 of 4 on ip-192-168-0-14 Hello MPI! mesg from 2 of 4 on ip-192-168-0-14 Hello MPI! mesg from 3 of 4 on ip-192-168-0-14
Start it with Torque (without -tm support)
# Write the Torque job file. The heredoc delimiter is quoted ('EOF') so that
# $PBS_O_WORKDIR is stored literally and is expanded by Torque when the job
# runs, instead of being expanded (to an empty string) by the submitting shell.
cat <<'EOF' > mpi-test_2_1_mpirun
#PBS -N helloworld
#PBS -l nodes=2:ppn=1
cd $PBS_O_WORKDIR
/usr/bin/mpirun -np 2 --hostfile /etc/torque/hostfile -v -v -v /tmp/hello.out
EOF
# Submit the job to the default queue.
qsub mpi-test_2_1_mpirun
Check the output files
cat helloworld.o* Hello MPI from the server process! Hello MPI! mesg from 1 of 2 on ip-192-168-0-15 cat helloworld.e*
Start it with Torque (with -tm support)
package is ready but not in squeeze yet
example script for setting up torque:
#!/bin/bash
# start_torque.sh - set up a Torque cluster (optionally with OpenMPI) on
# Eucalyptus instances.
#
# The script runs in one of two roles:
#   * on the master (default): copies itself to every instance, runs itself
#     there with -i, then generates and distributes ssh keys for user "guest";
#   * inside an instance (-i / --in-instance): installs and configures the
#     Torque server/scheduler and/or the Torque mom on that instance.
#
# -e: abort on the first failing command, -x: trace every command.
set -ex

#NOTES:
# in whatever mode you are running your Eucalyptus system, you will have two interfaces, one is called public (you can reach the instances from outside),
# one is called inside (you can reach the instances usually only from inside, or maybe from your front end)
# you have to specify a TORQUE SERVER and one or more TORQUE NODES, because I am accessing those instances you have to specify working IP addresses.
# example: SYSTEM mode, both interfaces are using the same address, so there is no distinction to make
# example: MANAGED modes, public and private interface are different, TORQUE has to be setup for the PRIVATE mode (intra-cloud communication needs firewall settings)
# usage: bash start_torque.sh --verbose -s="192.168.0.13" -n="192.168.0.14,192.168.0.17,192.168.0.45" -k="~/.euca/mykey.priv"

# Succeed when $1 is one of the space-separated entries of $2.
# A whole-word comparison is required: a plain substring test would treat
# 192.168.0.1 as a member of "192.168.0.14".
ip_in_list() {
  [[ -n "$1" && " $2 " == *" $1 "* ]]
}

# Print the host name used for an IP address: "ip-" followed by the address
# with every dot replaced by a dash (192.168.0.14 -> ip-192-168-0-14).
ip_to_hostname() {
  echo "ip-${1//./-}"
}

# Print the IPv4 address of eth0 as reported by ifconfig ("inet addr:" format).
eth0_ip() {
  /sbin/ifconfig eth0 | grep "inet addr" | awk '{print $2}' | sed 's/addr\://'
}

# defaults
VERBOSE=0
IN_INSTANCE=0
MPI=0   # without a default, "[ $MPI -eq 1 ]" is malformed when -m is omitted

echo "$(hostname) : $(eth0_ip)"

# Options are given as --name=value; "${i#*=}" strips everything up to and
# including the first "=".
for i in "$@"; do
  case "$i" in
    -s=*|--torque-server=*)
      PUBLIC_TORQUE_SERVER_IP=${i#*=}
      echo $PUBLIC_TORQUE_SERVER_IP
      ;;
    -n=*|--torque-nodes=*)
      NIPS=${i#*=}
      # comma-separated on the command line, space-separated internally
      PUBLIC_NODES_IP=${NIPS//,/ }
      echo $NIPS
      echo $PUBLIC_NODES_IP
      ;;
    -k=*|--key=*)
      KEY=${i#*=}
      echo $KEY
      ;;
    --verbose)
      VERBOSE=1
      ;;
    -i|--in-instance)
      IN_INSTANCE=1
      ;;
    -m=*|--with-mpi=*)
      MPI=${i#*=}
      ;;
    *)
      echo "unknown option"
      ;;
  esac
done

# only 0 or 1 are feasible values for MPI
case "$MPI" in
  0|1) ;;
  *)
    echo "invalid value for -m/--with-mpi: '$MPI' (expected 0 or 1)"
    exit 1
    ;;
esac

# Helper executed as root inside every instance: creates an ssh key pair for
# user guest. An existing key is kept, so that re-running the setup does not
# stop at ssh-keygen's overwrite prompt.
cat > keygen_in_instance.sh << 'EOF'
#!/bin/bash
if [ ! -f /home/guest/.ssh/id_rsa ] ; then
  su guest -c 'ssh-keygen -t rsa -N "" -f /home/guest/.ssh/id_rsa'
fi
EOF
chmod 755 keygen_in_instance.sh

# BEGIN execution on master #################################################
if [ "$IN_INSTANCE" -eq 0 ] ; then
  # join server and nodes (the server may also be listed as a node)
  if ip_in_list "$PUBLIC_TORQUE_SERVER_IP" "$PUBLIC_NODES_IP" ; then
    ALL_INSTANCES="$PUBLIC_NODES_IP"
  else
    ALL_INSTANCES="$PUBLIC_TORQUE_SERVER_IP $PUBLIC_NODES_IP"
  fi
  echo $ALL_INSTANCES

  # NOTE: the ssh/scp commands go through eval on purpose: the key is usually
  # passed as -k="~/.euca/mykey.priv" and the quoted "~" is only expanded when
  # the command line is parsed again by eval.

  # copy setup-torque-script to eucalyptus instances
  for NODE_IP in $ALL_INSTANCES ; do
    echo $NODE_IP
    # make this host known to ~/.ssh/known_hosts
    eval "ssh -i $KEY -o StrictHostKeychecking=no root@$NODE_IP echo ''"
    eval "/usr/bin/scp -p -i $KEY start_torque.sh root@$NODE_IP:/root/start_torque.sh"
    # MPI example
    eval "scp -p -i $KEY compileMPI.sh helloworld.c root@$NODE_IP:/root/"
    # start script in instance
    eval "ssh -X -i $KEY root@$NODE_IP \"/root/start_torque.sh -s=\"$PUBLIC_TORQUE_SERVER_IP\" -n=\"$NIPS\" -i -m=$MPI \""
  done

  # generate keys in instances - for user guest
  for NODE_IP in $ALL_INSTANCES ; do
    echo $NODE_IP
    eval "/usr/bin/scp -p -i $KEY keygen_in_instance.sh root@$NODE_IP:/root/keygen_in_instance.sh"
    eval "ssh -X -i $KEY root@$NODE_IP \"/root/keygen_in_instance.sh\""
  done

  # distribute keys
  for NODE_IP in $ALL_INSTANCES ; do
    # fetch this node's public key once, then push it to all nodes
    eval "/usr/bin/scp -p -i $KEY root@$NODE_IP:/home/guest/.ssh/id_rsa.pub /tmp/id_rsa.pub"
    for NODE_IP2 in $ALL_INSTANCES ; do
      echo $NODE_IP2
      eval "/usr/bin/scp -p -i $KEY /tmp/id_rsa.pub root@$NODE_IP2:/tmp/id_rsa.pub"
      eval "ssh -X -i $KEY root@$NODE_IP2 \"cat /tmp/id_rsa.pub >> /home/guest/.ssh/authorized_keys\""
      #TODO, I need an entry in known_hosts, for now the following happens from within the instances
      #eval "ssh -X -i $KEY root@$NODE_IP \"ssh -o StrictHostKeychecking=no guest@$NODE_IP2 & echo '' & wait\""
    done
    # hosts.sh is generated inside the instance (see below); it logs in once
    # as guest on every node so that known_hosts gets populated
    eval "ssh -X -i $KEY root@$NODE_IP /root/hosts.sh"
  done
  exit # on master don't execute commands for instances
fi
# END execution on master #################################################

# BEGIN execution in instance ###############################################

# Print the command line help.
usage() {
cat << EOF
usage: $0 options

This script starts the torque environment.

OPTIONS:
   -h      Show this message
   -n      nodes e.g. "192.168.0.14,192.168.0.14"
   -s      torque server ip e.g. "192.168.0.13"
   -k      key file
   -v      Verbose
   -m      With MPI support

example: start_torque.sh --verbose -s="192.168.0.13" -n="192.168.0.14,192.168.0.17,192.168.0.45" -k="~/.euca/mykey.priv"
EOF
}

#SERVER_IP NIPS KEY
#if [[ -z $PUBLIC_NODES_IP ]] || [[ -z $PUBLIC_TORQUE_SERVER_IP ]]
#then
#  usage
#  exit 1
#fi

# Install Debian package $1 unless dpkg already reports it as installed.
# Returns non-zero (which aborts the script under "set -e") after printing a
# message when the installation fails.
install_package() {
  local package=$1
  if [ "$(dpkg-query -W -f='${Status}\n' "$package")" != "install ok installed" ] ; then
    # apt-get is tested directly: a separate "$?" check would never be
    # reached because "set -e" aborts first
    if ! apt-get -o Dpkg::Options::="--force-confnew" --force-yes -y install "$package" ; then
      echo "aptitude install $package failed"
      return 1
    fi
    #aptitude -y install $package
  else
    echo "package $package is already installed"
  fi
}

export DEBIAN_FRONTEND="noninteractive"
export APT_LISTCHANGES_FRONTEND="none"

API_VERSION="2008-02-01"
METADATA_URL="http://169.254.169.254/$API_VERSION/meta-data"
CURL="/usr/bin/curl"

# those variables are needed for the locales package
export LANGUAGE=en_US.UTF-8
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8

# for dialog frontend
export PATH=$PATH:/sbin:/usr/sbin:/usr/local/sbin
export TERM=linux

PRIVATE_TORQUE_SERVER_IP="172.16.1.2"
PRIVATE_NODES="172.16.1.2 172.16.1.3 172.16.1.4 172.16.1.5 172.16.1.6 172.16.1.7 172.16.1.8"

MODE="public"
#MODE="private"
#MODE="system"

PUBLIC_TORQUE_SERVER_HOSTNAME=$(ip_to_hostname "$PUBLIC_TORQUE_SERVER_IP")
echo $PUBLIC_TORQUE_SERVER_IP $PUBLIC_TORQUE_SERVER_HOSTNAME
PRIVATE_TORQUE_SERVER_HOSTNAME=$(ip_to_hostname "$PRIVATE_TORQUE_SERVER_IP")
echo $PRIVATE_TORQUE_SERVER_IP $PRIVATE_TORQUE_SERVER_HOSTNAME

#GET INSTANCE IPs, create hostnames
if [ "$MODE" == "public" ] ; then
  PUBLIC_INSTANCE_IP=$(eth0_ip)
else
  #PUBLIC_INSTANCE_IP=192.168.0.115
  PUBLIC_INSTANCE_IP=$(curl -s $METADATA_URL/public-ipv4)
fi
#PUBLIC_INSTANCE_HOSTNAME=`curl -s $METADATA_URL/public-hostname`
PUBLIC_INSTANCE_HOSTNAME=$(ip_to_hostname "$PUBLIC_INSTANCE_IP")
echo $PUBLIC_INSTANCE_IP $PUBLIC_INSTANCE_HOSTNAME

PRIVATE_INSTANCE_IP=$(eth0_ip)
PRIVATE_INSTANCE_HOSTNAME=$(ip_to_hostname "$PRIVATE_INSTANCE_IP")
echo $PRIVATE_INSTANCE_IP $PRIVATE_INSTANCE_HOSTNAME

#using PUBLIC or PRIVATE interface
if [ "$MODE" == "public" ] ; then
  INSTANCE_HOSTNAME=$PUBLIC_INSTANCE_HOSTNAME
  NODES=$PUBLIC_NODES_IP
  INSTANCE_IP=$PUBLIC_INSTANCE_IP
  TORQUE_SERVER_IP=$PUBLIC_TORQUE_SERVER_IP
  TORQUE_SERVER_HOSTNAME=$PUBLIC_TORQUE_SERVER_HOSTNAME
elif [ "$MODE" == "private" ] ; then
  INSTANCE_HOSTNAME=$PRIVATE_INSTANCE_HOSTNAME
  NODES=$PRIVATE_NODES
  INSTANCE_IP=$PRIVATE_INSTANCE_IP
  TORQUE_SERVER_IP=$PRIVATE_TORQUE_SERVER_IP
  TORQUE_SERVER_HOSTNAME=$PRIVATE_TORQUE_SERVER_HOSTNAME
else
  # nothing below can work without these variables, so stop here
  echo "please specify private or public interface"
  exit 1
fi

# using Google's nameserver (added only once, so re-runs do not pile up entries)
if ! grep -qF "nameserver 8.8.8.8" /etc/resolv.conf ; then
  echo "nameserver 8.8.8.8" >> /etc/resolv.conf
fi

# update aptitude first
#echo "deb http://ftp.us.debian.org/debian squeeze main" > /etc/apt/sources.list
#echo "deb http://security.debian.org/ squeeze/updates main" >> /etc/apt/sources.list
#aptitude update
if ! apt-get -o Dpkg::Options::="--force-confnew" --force-yes -y update ; then
  echo "aptitude update failed"
  exit 1
fi

# get rid of some error messages because of missing locales package
install_package locales
echo "en_US.UTF-8 UTF-8" > /etc/locale.gen
locale-gen

# install portmap for NFS
install_package portmap
#TODO mount here

# install nmap
install_package nmap
nmap localhost -p 1-20000

# install lsb-release
install_package lsb-release
# Print some Information about the Operating System
DISTRIBUTOR=$(lsb_release -i | awk '{print $3}')
CODENAME=$(lsb_release -c | awk '{print $2}')
echo $DISTRIBUTOR $CODENAME

# install ntpdate
install_package ntpdate
###ntpdate pool.ntp.org
ntpdate ntp.ubuntu.com

# install OpenMPI packages
if [ "$MPI" -eq 1 ] ; then
  install_package "libopenmpi-dev"
  install_package "openmpi-bin"
  #compile MPI test program
  bash compileMPI.sh
fi

# make hostnames known to all the TORQUE nodes and server/scheduler
if [ "$MODE" == "private" ] ; then
  for NODE_IP in $PRIVATE_NODES ; do
    NODE_HOSTNAME=$(ip_to_hostname "$NODE_IP")
    echo "$NODE_IP $NODE_HOSTNAME" >> /etc/hosts
    #MPI support
    mkdir -p /etc/torque
    echo "$NODE_HOSTNAME slots=1" >> /etc/torque/hostfile
  done
fi

if [ "$MODE" == "public" ] ; then
  for NODE_IP in $PUBLIC_NODES_IP ; do
    NODE_HOSTNAME=$(ip_to_hostname "$NODE_IP")
    # on the server itself, the server's own address is handled further below
    if [ "$INSTANCE_IP" != "$TORQUE_SERVER_IP" ] || [ "$NODE_IP" != "$TORQUE_SERVER_IP" ] ; then
      # -w/-F: whole-word, literal match, so that 192.168.0.1 is not
      # considered present because of an entry for 192.168.0.14
      if ! grep -qwF -e "$NODE_IP" -e "$NODE_HOSTNAME" /etc/hosts ; then
        echo "$NODE_IP $NODE_HOSTNAME" >> /etc/hosts
      fi
    fi
    #MPI support
    mkdir -p /etc/torque
    if ! grep -qwF -e "$NODE_HOSTNAME" /etc/torque/hostfile ; then
      echo "$NODE_HOSTNAME slots=1" >> /etc/torque/hostfile
      echo "(su - guest -c \"ssh -t -t -o StrictHostKeychecking=no guest@$NODE_HOSTNAME echo ''\")& wait" >> /root/hosts.sh # for key distribution
    fi
  done
  if ! grep -qwF -e "$PUBLIC_TORQUE_SERVER_HOSTNAME" /etc/torque/hostfile ; then
    echo "(su - guest -c \"ssh -t -t -o StrictHostKeychecking=no guest@$PUBLIC_TORQUE_SERVER_HOSTNAME echo ''\")& wait" >> /root/hosts.sh # for key distribution
  fi
fi
# the master always runs /root/hosts.sh; make sure it exists even when no
# line was written above, otherwise chmod fails and "set -e" aborts
touch /root/hosts.sh
chmod 755 /root/hosts.sh

## on TORQUE server
if [ "$INSTANCE_IP" == "$TORQUE_SERVER_IP" ] ; then
  #this one is for the scheduler, if using the public interface
  if ! grep -qwF -e "127.0.1.1" -e "$PUBLIC_INSTANCE_HOSTNAME" /etc/hosts ; then
    echo "127.0.1.1 $PUBLIC_INSTANCE_HOSTNAME" >> /etc/hosts
  fi
  # echo "$PRIVATE_INSTANCE_IP $PRIVATE_INSTANCE_HOSTNAME" >> /etc/hosts
else
  if ! grep -qwF -e "$TORQUE_SERVER_IP" -e "$TORQUE_SERVER_HOSTNAME" /etc/hosts ; then
    echo "$TORQUE_SERVER_IP $TORQUE_SERVER_HOSTNAME" >> /etc/hosts
  fi
fi

# need to set a hostname before installing torque packages
echo $INSTANCE_HOSTNAME > /etc/hostname # preserve hostname if rebooting is necessary
hostname $INSTANCE_HOSTNAME # immediately change
#getent hosts `hostname`
#PUBLIC_INSTANCE_HOSTNAME=`curl -s $METADATA_URL/public-hostname`

#echo "deb http://ftp.us.debian.org/debian sid main" > /etc/apt/sources.list
apt-get -o Dpkg::Options::="--force-confnew" --force-yes -y update
if [ "$INSTANCE_IP" == "$TORQUE_SERVER_IP" ] ; then
  apt-get -o Dpkg::Options::="--force-confnew" --force-yes -y install torque-server torque-scheduler torque-client
  #aptitude -y install torque-server torque-scheduler torque-client
fi
if ip_in_list "$INSTANCE_IP" "$PUBLIC_NODES_IP" ; then
  apt-get -o Dpkg::Options::="--force-confnew" --force-yes -y install torque-mom
  #aptitude -y install torque-mom
fi

## fix /tmp directory in debian eucalyptus image
chmod 777 /tmp

## add user to all nodes
USER=auser
if id $USER > /dev/null 2>&1 ; then
  echo "user exist!"
else
  adduser $USER --disabled-password --gecos ""
fi

#echo $PUBLIC_TORQUE_SERVER_HOSTNAME > /etc/torque/server_name
#echo $PUBLIC_INSTANCE_HOSTNAME > /etc/hostname # preserve hostname if rebooting is necessary
#hostname $PUBLIC_INSTANCE_HOSTNAME # immediately change

# Torque names its daily log files YYYYMMDD
DATE=$(date '+%Y%m%d')

## for TORQUE mom
if ip_in_list "$INSTANCE_IP" "$PUBLIC_NODES_IP" ; then
  echo $TORQUE_SERVER_HOSTNAME > /etc/torque/server_name
  echo "\$timeout 120" > /var/spool/torque/mom_priv/config # more options possible (NFS...)
  echo "\$loglevel 5" >> /var/spool/torque/mom_priv/config # more options possible (NFS...)
  /etc/init.d/torque-mom restart
  cat /var/spool/torque/mom_logs/$DATE
fi

## for TORQUE server
if [ "$INSTANCE_IP" == "$TORQUE_SERVER_IP" ] ; then
  echo $TORQUE_SERVER_HOSTNAME > /etc/torque/server_name
  # rebuild the node list from scratch: one "<hostname> np=1" line per node
  : > /var/spool/torque/server_priv/nodes
  for NODE_IP in $NODES ; do
    NODE_HOSTNAME=$(ip_to_hostname "$NODE_IP")
    printf '%s np=1\n' "$NODE_HOSTNAME" >> /var/spool/torque/server_priv/nodes
  done
  /etc/init.d/torque-server restart
  /etc/init.d/torque-scheduler restart
  qmgr -c "s s scheduling=true"
  qmgr -c "c q batch queue_type=execution"
  qmgr -c "s q batch started=true"
  qmgr -c "s q batch enabled=true"
  qmgr -c "s q batch resources_default.nodes=1"
  qmgr -c "s q batch resources_default.walltime=3600"
  # had to set this for MPI, TODO: double check
  qmgr -c "s q batch resources_min.nodes=1"
  qmgr -c "s s default_queue=batch"
  # let all nodes submit jobs, not only the server
  qmgr -c "s s allow_node_submit=true"
  #qmgr -c 'set server submit_hosts += $TORQUE_SERVER_IP'
  #qmgr -c 'set server submit_hosts += $INSTANCE_IP'
  # adding extra nodes
  #qmgr -c "create node $INSTANCE_HOSTNAME"

  #debug
  cat /var/spool/torque/server_logs/$DATE
  qstat -q
  pbsnodes -a
  cat /etc/torque/server_name
fi