Merge branch 'master' into el5

Conflicts:
	.gitignore
	sources
	torque-munge-size.patch
	torque.spec
This commit is contained in:
David Brown 2015-04-07 21:41:16 -07:00
commit 72f042a527
10 changed files with 1141 additions and 419 deletions

16
.gitignore vendored
View File

@ -1,4 +1,14 @@
torque-2.3.10.tar.gz
/torque-2.3.12.tar.gz
/torque-2.3.13.tar.gz
/torque-2.5.2.tar.gz
/torque-2.5.3.tar.gz
/torque-2.5.4.tar.gz
/torque-2.5.5.tar.gz
/torque-2.5.7.tar.gz
/torque-3.0.0.tar.gz
/torque-3.0.0-snap.201102011355.tar.gz
/torque-3.0.1.tar.gz
/torque-3.0.2.tar.gz
/torque-3.0.3.tar.gz
/torque-3.0.4.tar.gz
/torque-4.2.6.1.tar.gz
/torque-4.2.8.tar.gz
/torque-4.2.10.tar.gz

View File

@ -1,20 +0,0 @@
To setup a basic single-node localhost-only batch system, install the
torque-server, torque-mom, and torque-scheduler packages, and do something like
this:
/sbin/chkconfig pbs_mom on
/sbin/chkconfig pbs_server on
/sbin/chkconfig pbs_sched on
/bin/hostname --long > %{torquehomedir}/server_priv/nodes
/bin/hostname --long > %{torquehomedir}/server_name
service pbs_server start
qmgr -c "s s scheduling=true"
qmgr -c "c q batch queue_type=execution"
qmgr -c "s q batch started=true"
qmgr -c "s q batch enabled=true"
qmgr -c "s q batch resources_default.nodes=1"
qmgr -c "s q batch resources_default.walltime=3600"
qmgr -c "s s default_queue=batch"
service pbs_mom restart
service pbs_sched restart

84
README.Fedora Normal file
View File

@ -0,0 +1,84 @@
This README describes how to get the most basic working
torque service on a single host.
To set up a basic single-node localhost-only batch system, install the
torque-server, torque-mom, and torque-scheduler packages, and do something like
this:
0) If torque is built with munge support, then munge
must be enabled first on all nodes. The munge
package should already be installed.
Create a munge key with
/usr/sbin/create-munge-key
Copy resulting key /etc/munge/munge.key to
all torque nodes in your cluster including
pbs_server, pbs_mom and client (qstat,qsub) nodes.
1) Get your full hostname with
# /bin/hostname --long
e.g. myhost.example.org
2) Edit /etc/torque/server_name
to contain the single line
myhost.example.org
3) Edit /etc/torque/mom/config
to contain the single line
$pbsserver myhost.example.org
4) Create a torque serverdb file.
# /usr/sbin/pbs_server -D -t create
Warning: this will remove any existing serverdb
file located at /var/lib/torque/server_priv/serverdb.
You will have to press Ctrl+C to stop the pbs_server command;
it will only take a moment to create this file.
5) Start the pbs_server and configure it.
# service pbs_server start
# qmgr -c "s s scheduling=true"
# qmgr -c "c q batch queue_type=execution"
# qmgr -c "s q batch started=true"
# qmgr -c "s q batch enabled=true"
# qmgr -c "s q batch resources_default.nodes=1"
# qmgr -c "s q batch resources_default.walltime=3600"
# qmgr -c "s s default_queue=batch"
6) Add one batch worker to your pbs_server.
# qmgr -c "c n myhost.example.org"
7) Start the pbs_mom and pbs_sched daemons.
# service pbs_mom start
# service pbs_sched start
8) Use chkconfig to start the services at boot time.
# /sbin/chkconfig pbs_mom on
# /sbin/chkconfig pbs_server on
# /sbin/chkconfig pbs_sched on
# /sbin/chkconfig munge on
9) Submit a test job.
As a regular user (not as root), run the following:
$ qsub <<EOF
hostname
echo "Hi I am a batch job running in torque"
EOF
10) Monitor the state of that job with qstat.
In case of problems, look first in /var/log/torque.

2
config Normal file
View File

@ -0,0 +1,2 @@
# Configuration for pbs_mom.
$pbsserver localhost

View File

@ -1 +1,2 @@
aa033adc22df8ab333e5014dd93754b6 torque-2.5.7.tar.gz
3dd4348f54ba236ee7c208cc6b97f674 torque-4.2.8.tar.gz
541f58ab46166e86d7a468500be3fa4d torque-4.2.10.tar.gz

View File

@ -0,0 +1,173 @@
diff -uNr torque-2.5.5.ORIG/src/lib/Libnet/get_hostaddr.c torque-2.5.5/src/lib/Libnet/get_hostaddr.c
--- torque-2.5.5.ORIG/src/lib/Libnet/get_hostaddr.c 2011-06-08 18:40:00.251913002 +0200
+++ torque-2.5.5/src/lib/Libnet/get_hostaddr.c 2011-06-08 18:41:06.651911946 +0200
@@ -147,7 +147,8 @@
if (hp == NULL)
{
- sprintf(log_buffer,"cannot resolve IP address for host '%s' herror=%d: %s",
+ snprintf(log_buffer, sizeof(log_buffer),
+ "cannot resolve IP address for host '%s' herror=%d: %s",
hostname,
h_errno,
hstrerror(h_errno));
diff -uNr torque-2.5.5.ORIG/src/server/req_quejob.c torque-2.5.5/src/server/req_quejob.c
--- torque-2.5.5.ORIG/src/server/req_quejob.c 2011-06-08 18:40:00.315913002 +0200
+++ torque-2.5.5/src/server/req_quejob.c 2011-06-08 18:49:36.449912391 +0200
@@ -1053,17 +1053,19 @@
{
if (errno == 0)
{
- sprintf(log_buffer, "job %s in unexpected state '%s'",
- pj->ji_qs.ji_jobid,
- PJobSubState[pj->ji_qs.ji_substate]);
+ snprintf(log_buffer, sizeof(log_buffer),
+ "job %s in unexpected state '%s'",
+ pj->ji_qs.ji_jobid,
+ PJobSubState[pj->ji_qs.ji_substate]);
}
else
{
- sprintf(log_buffer, "job %s in unexpected state '%s' (errno=%d - %s)",
- pj->ji_qs.ji_jobid,
- PJobSubState[pj->ji_qs.ji_substate],
- errno,
- strerror(errno));
+ snprintf(log_buffer, sizeof(log_buffer),
+ "job %s in unexpected state '%s' (errno=%d - %s)",
+ pj->ji_qs.ji_jobid,
+ PJobSubState[pj->ji_qs.ji_substate],
+ errno,
+ strerror(errno));
}
log_err(errno, id, log_buffer);
@@ -1264,9 +1266,10 @@
if (LOGLEVEL >= 6)
{
- sprintf(log_buffer, "successfully moved file '%s' for job '%s'",
- namebuf,
- preq->rq_ind.rq_jobfile.rq_jobid);
+ snprintf(log_buffer, sizeof(log_buffer),
+ "successfully moved file '%s' for job '%s'",
+ namebuf,
+ preq->rq_ind.rq_jobfile.rq_jobid);
log_record(
PBSEVENT_JOB,
@@ -1382,9 +1385,11 @@
{
char tmpLine[1024];
- sprintf(tmpLine, "cannot save job - errno=%d - %s",
- errno,
- strerror(errno));
+ snprintf(tmpLine, sizeof(tmpLine),
+ "cannot save job - errno=%d - %s",
+ errno,
+ strerror(errno));
+
log_err(errno, id, tmpLine);
@@ -1408,9 +1413,11 @@
{
/* reply failed, purge the job and close the connection */
- sprintf(log_buffer, "cannot report jobid - errno=%d - %s",
- errno,
- strerror(errno));
+ snprintf(log_buffer, sizeof(log_buffer),
+ "cannot report jobid - errno=%d - %s",
+ errno,
+ strerror(errno));
+
log_err(errno, id, log_buffer);
@@ -1700,11 +1707,12 @@
/* need to format message first, before request goes away */
- sprintf(log_buffer, msg_jobnew,
- preq->rq_user, preq->rq_host,
- pj->ji_wattr[(int)JOB_ATR_job_owner].at_val.at_str,
- pj->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str,
- pj->ji_qhdr->qu_qs.qu_name);
+ snprintf(log_buffer, sizeof(log_buffer),
+ msg_jobnew,
+ preq->rq_user, preq->rq_host,
+ pj->ji_wattr[JOB_ATR_job_owner].at_val.at_str,
+ pj->ji_wattr[JOB_ATR_jobname].at_val.at_str,
+ pj->ji_qhdr->qu_qs.qu_name);
/* acknowledge the request with the job id */
@@ -1739,8 +1747,10 @@
{
if (LOGLEVEL >= 7)
{
- sprintf(log_buffer, "Trying to AUTORUN job %s",
- pj->ji_qs.ji_jobid);
+ snprintf(log_buffer, sizeof(log_buffer),
+ "Trying to AUTORUN job %s",
+ pj->ji_qs.ji_jobid);
+
log_record(
PBSEVENT_JOB,
PBS_EVENTCLASS_JOB,
@@ -1861,7 +1871,7 @@
if (!user_account_read_user(arguser))
{
- sprintf(log_buffer, "user_account_verify(%s, %s) -> USER NOT FOUND",
+ snprintf(log_buffer,sizeof(log_buffer), "user_account_verify(%s, %s) -> USER NOT FOUND",
arguser,
argaccount);
@@ -1872,7 +1882,7 @@
{
if (strcmp(argaccount, UserAcct.ActAdr[i]) == 0)
{
- sprintf(log_buffer, "user_account_verify(%s, %s) -> SUCCESS",
+ snprintf(log_buffer,sizeof(log_buffer), "user_account_verify(%s, %s) -> SUCCESS",
arguser,
argaccount);
@@ -1882,7 +1892,7 @@
}
} /* END for (i) */
- sprintf(log_buffer, "user_account_verify(%s, %s) -> FAILED",
+ snprintf(log_buffer, sizeof(log_buffer), "user_account_verify(%s, %s) -> FAILED",
arguser,
argaccount);
@@ -1909,7 +1919,7 @@
if (!user_account_read_user(arguser))
{
- sprintf(log_buffer, "user_account_default(%s) = USER NOT FOUND",
+ snprintf(log_buffer,sizeof(log_buffer), "user_account_default(%s) = USER NOT FOUND",
arguser);
goto user_account_default_done;
@@ -1917,7 +1927,7 @@
if (UserAcct.ActCnt < 1)
{
- sprintf(log_buffer, "user_account_default(%s) = NO PROJECT FOUND",
+ snprintf(log_buffer, sizeof(log_buffer), "user_account_default(%s) = NO PROJECT FOUND",
arguser);
goto user_account_default_done;
@@ -1925,7 +1935,7 @@
rc = UserAcct.ActAdr[0];
- sprintf(log_buffer, "user_account_default(%s) = %s",
+ snprintf(log_buffer, sizeof(log_buffer), "user_account_default(%s) = %s",
arguser,
rc);

View File

@ -1,7 +1,17 @@
diff -uNr torque-3.0.1.ORIG/src/include/libpbs.h torque-3.0.1/src/include/libpbs.h
--- torque-3.0.1.ORIG/src/include/libpbs.h 2011-06-17 19:19:32.984380003 +0200
+++ torque-3.0.1/src/include/libpbs.h 2011-06-17 19:23:19.406379620 +0200
@@ -117,7 +117,7 @@
From 12a8d7dde1d07aed670f0dd50b317b256daaa991 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ha=C3=AFkel=20Gu=C3=A9mar?= <hguemar@fedoraproject.org>
Date: Sun, 12 Jan 2014 11:42:32 +0100
Subject: [PATCH] munge size fix
---
src/include/libpbs.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/include/libpbs.h b/src/include/libpbs.h
index 6d32c8b..06b1bec 100644
--- a/src/include/libpbs.h
+++ b/src/include/libpbs.h
@@ -121,7 +121,7 @@
#define EOF -1
#endif
@ -9,4 +19,7 @@ diff -uNr torque-3.0.1.ORIG/src/include/libpbs.h torque-3.0.1/src/include/libpbs
+#define MUNGE_SIZE 1024 /* I do not know what the proper size of this should be. My
testing with munge shows it creates a string of 128 bytes */
/* enums for standard job files (sync w/TJobFileType[]) */
--
1.8.4.2

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +1,11 @@
[Desktop Entry]
Encoding=UTF-8
Name=xpbs
Name=xPBS
GenericName=PBS/TORQUE client
Comment=View job status and submit jobs
Exec=xpbs
Icon=xpbs.png
Terminal=false
Type=Application
Categories=Application;Other;
Version=1.1.12
Categories=Education;Science;ComputerScience;ParallelComputing;
Version=1.0

View File

@ -1,11 +1,11 @@
[Desktop Entry]
Encoding=UTF-8
Name=xpbsmon
Name=xPBSMon
GenericName=PBS/TORQUE cluster monitor
Comment=View node status
Exec=xpbsmon
Icon=xpbsmon.png
Terminal=false
Type=Application
Categories=Application;Other;
Version=2.3
Categories=Education;Science;ComputerScience;ParallelComputing;
Version=1.0