Merge branch 'master' into el5
Conflicts: .gitignore sources torque-munge-size.patch torque.spec
This commit is contained in:
commit
72f042a527
16
.gitignore
vendored
16
.gitignore
vendored
@ -1,4 +1,14 @@
|
|||||||
torque-2.3.10.tar.gz
|
/torque-2.5.2.tar.gz
|
||||||
/torque-2.3.12.tar.gz
|
/torque-2.5.3.tar.gz
|
||||||
/torque-2.3.13.tar.gz
|
/torque-2.5.4.tar.gz
|
||||||
|
/torque-2.5.5.tar.gz
|
||||||
/torque-2.5.7.tar.gz
|
/torque-2.5.7.tar.gz
|
||||||
|
/torque-3.0.0.tar.gz
|
||||||
|
/torque-3.0.0-snap.201102011355.tar.gz
|
||||||
|
/torque-3.0.1.tar.gz
|
||||||
|
/torque-3.0.2.tar.gz
|
||||||
|
/torque-3.0.3.tar.gz
|
||||||
|
/torque-3.0.4.tar.gz
|
||||||
|
/torque-4.2.6.1.tar.gz
|
||||||
|
/torque-4.2.8.tar.gz
|
||||||
|
/torque-4.2.10.tar.gz
|
||||||
|
@ -1,20 +0,0 @@
|
|||||||
To setup a basic single-node localhost-only batch system, install the
|
|
||||||
torque-server, torque-mom, and torque-scheduler packages, and do something like
|
|
||||||
this:
|
|
||||||
|
|
||||||
/sbin/chkconfig pbs_mom on
|
|
||||||
/sbin/chkconfig pbs_server on
|
|
||||||
/sbin/chkconfig pbs_sched on
|
|
||||||
/bin/hostname --long > %{torquehomedir}/server_priv/nodes
|
|
||||||
/bin/hostname --long > %{torquehomedir}/server_name
|
|
||||||
service pbs_server start
|
|
||||||
qmgr -c "s s scheduling=true"
|
|
||||||
qmgr -c "c q batch queue_type=execution"
|
|
||||||
qmgr -c "s q batch started=true"
|
|
||||||
qmgr -c "s q batch enabled=true"
|
|
||||||
qmgr -c "s q batch resources_default.nodes=1"
|
|
||||||
qmgr -c "s q batch resources_default.walltime=3600"
|
|
||||||
qmgr -c "s s default_queue=batch"
|
|
||||||
service pbs_mom restart
|
|
||||||
service pbs_sched restart
|
|
||||||
|
|
84
README.Fedora
Normal file
84
README.Fedora
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
This README describes how to get the most basic working
|
||||||
|
torque service on a single host.
|
||||||
|
|
||||||
|
To setup a basic single-node localhost-only batch system, install the
|
||||||
|
torque-server, torque-mom, and torque-scheduler packages, and do something like
|
||||||
|
this:
|
||||||
|
|
||||||
|
0) If torque is built with munge support then this
|
||||||
|
must be enabled first on all nodes. The munge
|
||||||
|
package should already be installed.
|
||||||
|
|
||||||
|
Create a munge key with
|
||||||
|
|
||||||
|
/usr/sbin/create-munge-key
|
||||||
|
|
||||||
|
Copy resulting key /etc/munge/munge.key to
|
||||||
|
all torque nodes in your cluster including
|
||||||
|
pbs_server, pbs_mom and client (qstat,qsub) nodes.
|
||||||
|
|
||||||
|
1) Get your full hostname with
|
||||||
|
|
||||||
|
# /bin/hostname --long
|
||||||
|
|
||||||
|
e.g. myhost.example.org
|
||||||
|
|
||||||
|
2) Edit /etc/torque/server_name
|
||||||
|
to contain the single line
|
||||||
|
|
||||||
|
myhost.example.org
|
||||||
|
|
||||||
|
3) Edit /etc/torque/mom/config
|
||||||
|
to contain the single line
|
||||||
|
|
||||||
|
$pbsserver myhost.example.org
|
||||||
|
|
||||||
|
4) Create a torque serverdb file.
|
||||||
|
# /usr/sbin/pbs_server -D -t create
|
||||||
|
|
||||||
|
Warning: this will remove any existing serverdb
|
||||||
|
file located at /var/lib/torque/server_priv/serverdb
|
||||||
|
|
||||||
|
You will have to Ctrl^C the pbs_server command, it will
|
||||||
|
only take a moment to create this file.
|
||||||
|
|
||||||
|
5) Start the pbs_server and configure it.
|
||||||
|
# service pbs_server start
|
||||||
|
# qmgr -c "s s scheduling=true"
|
||||||
|
# qmgr -c "c q batch queue_type=execution"
|
||||||
|
# qmgr -c "s q batch started=true"
|
||||||
|
# qmgr -c "s q batch enabled=true"
|
||||||
|
# qmgr -c "s q batch resources_default.nodes=1"
|
||||||
|
# qmgr -c "s q batch resources_default.walltime=3600"
|
||||||
|
# qmgr -c "s s default_queue=batch"
|
||||||
|
|
||||||
|
6) Add one batch worker to your pbs_server.
|
||||||
|
|
||||||
|
# qmgr -c "c n myhost.example.org"
|
||||||
|
|
||||||
|
7) Start the pbs_mom and pbs_sched daemons.
|
||||||
|
|
||||||
|
# service pbs_mom start
|
||||||
|
# service pbs_sched start
|
||||||
|
|
||||||
|
8) Use chkconfig to start the services at boot time.
|
||||||
|
|
||||||
|
# /sbin/chkconfig pbs_mom on
|
||||||
|
# /sbin/chkconfig pbs_server on
|
||||||
|
# /sbin/chkconfig pbs_sched on
|
||||||
|
# /sbin/chkconfig munge on
|
||||||
|
|
||||||
|
9) Submit a test job.
|
||||||
|
As a regular user (not as root), run the following:
|
||||||
|
|
||||||
|
$ qsub <<EOF
|
||||||
|
hostname
|
||||||
|
echo "Hi I am a batch job running in torque"
|
||||||
|
EOF
|
||||||
|
|
||||||
|
10) Monitor the state of that job with qstat.
|
||||||
|
|
||||||
|
In case of problems first of all look in /var/log/torque
|
||||||
|
|
||||||
|
|
||||||
|
|
3
sources
3
sources
@ -1 +1,2 @@
|
|||||||
aa033adc22df8ab333e5014dd93754b6 torque-2.5.7.tar.gz
|
3dd4348f54ba236ee7c208cc6b97f674 torque-4.2.8.tar.gz
|
||||||
|
541f58ab46166e86d7a468500be3fa4d torque-4.2.10.tar.gz
|
||||||
|
173
torque-buffer-overrun-2.5.5.patch
Normal file
173
torque-buffer-overrun-2.5.5.patch
Normal file
@ -0,0 +1,173 @@
|
|||||||
|
diff -uNr torque-2.5.5.ORIG/src/lib/Libnet/get_hostaddr.c torque-2.5.5/src/lib/Libnet/get_hostaddr.c
|
||||||
|
--- torque-2.5.5.ORIG/src/lib/Libnet/get_hostaddr.c 2011-06-08 18:40:00.251913002 +0200
|
||||||
|
+++ torque-2.5.5/src/lib/Libnet/get_hostaddr.c 2011-06-08 18:41:06.651911946 +0200
|
||||||
|
@@ -147,7 +147,8 @@
|
||||||
|
|
||||||
|
if (hp == NULL)
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer,"cannot resolve IP address for host '%s' herror=%d: %s",
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer),
|
||||||
|
+ "cannot resolve IP address for host '%s' herror=%d: %s",
|
||||||
|
hostname,
|
||||||
|
h_errno,
|
||||||
|
hstrerror(h_errno));
|
||||||
|
diff -uNr torque-2.5.5.ORIG/src/server/req_quejob.c torque-2.5.5/src/server/req_quejob.c
|
||||||
|
--- torque-2.5.5.ORIG/src/server/req_quejob.c 2011-06-08 18:40:00.315913002 +0200
|
||||||
|
+++ torque-2.5.5/src/server/req_quejob.c 2011-06-08 18:49:36.449912391 +0200
|
||||||
|
@@ -1053,17 +1053,19 @@
|
||||||
|
{
|
||||||
|
if (errno == 0)
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer, "job %s in unexpected state '%s'",
|
||||||
|
- pj->ji_qs.ji_jobid,
|
||||||
|
- PJobSubState[pj->ji_qs.ji_substate]);
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer),
|
||||||
|
+ "job %s in unexpected state '%s'",
|
||||||
|
+ pj->ji_qs.ji_jobid,
|
||||||
|
+ PJobSubState[pj->ji_qs.ji_substate]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer, "job %s in unexpected state '%s' (errno=%d - %s)",
|
||||||
|
- pj->ji_qs.ji_jobid,
|
||||||
|
- PJobSubState[pj->ji_qs.ji_substate],
|
||||||
|
- errno,
|
||||||
|
- strerror(errno));
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer),
|
||||||
|
+ "job %s in unexpected state '%s' (errno=%d - %s)",
|
||||||
|
+ pj->ji_qs.ji_jobid,
|
||||||
|
+ PJobSubState[pj->ji_qs.ji_substate],
|
||||||
|
+ errno,
|
||||||
|
+ strerror(errno));
|
||||||
|
}
|
||||||
|
|
||||||
|
log_err(errno, id, log_buffer);
|
||||||
|
@@ -1264,9 +1266,10 @@
|
||||||
|
|
||||||
|
if (LOGLEVEL >= 6)
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer, "successfully moved file '%s' for job '%s'",
|
||||||
|
- namebuf,
|
||||||
|
- preq->rq_ind.rq_jobfile.rq_jobid);
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer),
|
||||||
|
+ "successfully moved file '%s' for job '%s'",
|
||||||
|
+ namebuf,
|
||||||
|
+ preq->rq_ind.rq_jobfile.rq_jobid);
|
||||||
|
|
||||||
|
log_record(
|
||||||
|
PBSEVENT_JOB,
|
||||||
|
@@ -1382,9 +1385,11 @@
|
||||||
|
{
|
||||||
|
char tmpLine[1024];
|
||||||
|
|
||||||
|
- sprintf(tmpLine, "cannot save job - errno=%d - %s",
|
||||||
|
- errno,
|
||||||
|
- strerror(errno));
|
||||||
|
+ snprintf(tmpLine, sizeof(tmpLine),
|
||||||
|
+ "cannot save job - errno=%d - %s",
|
||||||
|
+ errno,
|
||||||
|
+ strerror(errno));
|
||||||
|
+
|
||||||
|
|
||||||
|
log_err(errno, id, tmpLine);
|
||||||
|
|
||||||
|
@@ -1408,9 +1413,11 @@
|
||||||
|
{
|
||||||
|
/* reply failed, purge the job and close the connection */
|
||||||
|
|
||||||
|
- sprintf(log_buffer, "cannot report jobid - errno=%d - %s",
|
||||||
|
- errno,
|
||||||
|
- strerror(errno));
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer),
|
||||||
|
+ "cannot report jobid - errno=%d - %s",
|
||||||
|
+ errno,
|
||||||
|
+ strerror(errno));
|
||||||
|
+
|
||||||
|
|
||||||
|
log_err(errno, id, log_buffer);
|
||||||
|
|
||||||
|
@@ -1700,11 +1707,12 @@
|
||||||
|
|
||||||
|
/* need to format message first, before request goes away */
|
||||||
|
|
||||||
|
- sprintf(log_buffer, msg_jobnew,
|
||||||
|
- preq->rq_user, preq->rq_host,
|
||||||
|
- pj->ji_wattr[(int)JOB_ATR_job_owner].at_val.at_str,
|
||||||
|
- pj->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str,
|
||||||
|
- pj->ji_qhdr->qu_qs.qu_name);
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer),
|
||||||
|
+ msg_jobnew,
|
||||||
|
+ preq->rq_user, preq->rq_host,
|
||||||
|
+ pj->ji_wattr[JOB_ATR_job_owner].at_val.at_str,
|
||||||
|
+ pj->ji_wattr[JOB_ATR_jobname].at_val.at_str,
|
||||||
|
+ pj->ji_qhdr->qu_qs.qu_name);
|
||||||
|
|
||||||
|
/* acknowledge the request with the job id */
|
||||||
|
|
||||||
|
@@ -1739,8 +1747,10 @@
|
||||||
|
{
|
||||||
|
if (LOGLEVEL >= 7)
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer, "Trying to AUTORUN job %s",
|
||||||
|
- pj->ji_qs.ji_jobid);
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer),
|
||||||
|
+ "Trying to AUTORUN job %s",
|
||||||
|
+ pj->ji_qs.ji_jobid);
|
||||||
|
+
|
||||||
|
log_record(
|
||||||
|
PBSEVENT_JOB,
|
||||||
|
PBS_EVENTCLASS_JOB,
|
||||||
|
@@ -1861,7 +1871,7 @@
|
||||||
|
|
||||||
|
if (!user_account_read_user(arguser))
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer, "user_account_verify(%s, %s) -> USER NOT FOUND",
|
||||||
|
+ snprintf(log_buffer,sizeof(log_buffer), "user_account_verify(%s, %s) -> USER NOT FOUND",
|
||||||
|
arguser,
|
||||||
|
argaccount);
|
||||||
|
|
||||||
|
@@ -1872,7 +1882,7 @@
|
||||||
|
{
|
||||||
|
if (strcmp(argaccount, UserAcct.ActAdr[i]) == 0)
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer, "user_account_verify(%s, %s) -> SUCCESS",
|
||||||
|
+ snprintf(log_buffer,sizeof(log_buffer), "user_account_verify(%s, %s) -> SUCCESS",
|
||||||
|
arguser,
|
||||||
|
argaccount);
|
||||||
|
|
||||||
|
@@ -1882,7 +1892,7 @@
|
||||||
|
}
|
||||||
|
} /* END for (i) */
|
||||||
|
|
||||||
|
- sprintf(log_buffer, "user_account_verify(%s, %s) -> FAILED",
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer), "user_account_verify(%s, %s) -> FAILED",
|
||||||
|
arguser,
|
||||||
|
argaccount);
|
||||||
|
|
||||||
|
@@ -1909,7 +1919,7 @@
|
||||||
|
|
||||||
|
if (!user_account_read_user(arguser))
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer, "user_account_default(%s) = USER NOT FOUND",
|
||||||
|
+ snprintf(log_buffer,sizeof(log_buffer), "user_account_default(%s) = USER NOT FOUND",
|
||||||
|
arguser);
|
||||||
|
|
||||||
|
goto user_account_default_done;
|
||||||
|
@@ -1917,7 +1927,7 @@
|
||||||
|
|
||||||
|
if (UserAcct.ActCnt < 1)
|
||||||
|
{
|
||||||
|
- sprintf(log_buffer, "user_account_default(%s) = NO PROJECT FOUND",
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer), "user_account_default(%s) = NO PROJECT FOUND",
|
||||||
|
arguser);
|
||||||
|
|
||||||
|
goto user_account_default_done;
|
||||||
|
@@ -1925,7 +1935,7 @@
|
||||||
|
|
||||||
|
rc = UserAcct.ActAdr[0];
|
||||||
|
|
||||||
|
- sprintf(log_buffer, "user_account_default(%s) = %s",
|
||||||
|
+ snprintf(log_buffer, sizeof(log_buffer), "user_account_default(%s) = %s",
|
||||||
|
arguser,
|
||||||
|
rc);
|
||||||
|
|
@ -1,7 +1,17 @@
|
|||||||
diff -uNr torque-3.0.1.ORIG/src/include/libpbs.h torque-3.0.1/src/include/libpbs.h
|
From 12a8d7dde1d07aed670f0dd50b317b256daaa991 Mon Sep 17 00:00:00 2001
|
||||||
--- torque-3.0.1.ORIG/src/include/libpbs.h 2011-06-17 19:19:32.984380003 +0200
|
From: =?UTF-8?q?Ha=C3=AFkel=20Gu=C3=A9mar?= <hguemar@fedoraproject.org>
|
||||||
+++ torque-3.0.1/src/include/libpbs.h 2011-06-17 19:23:19.406379620 +0200
|
Date: Sun, 12 Jan 2014 11:42:32 +0100
|
||||||
@@ -117,7 +117,7 @@
|
Subject: [PATCH] munge size fix
|
||||||
|
|
||||||
|
---
|
||||||
|
src/include/libpbs.h | 2 +-
|
||||||
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/src/include/libpbs.h b/src/include/libpbs.h
|
||||||
|
index 6d32c8b..06b1bec 100644
|
||||||
|
--- a/src/include/libpbs.h
|
||||||
|
+++ b/src/include/libpbs.h
|
||||||
|
@@ -121,7 +121,7 @@
|
||||||
#define EOF -1
|
#define EOF -1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -9,4 +19,7 @@ diff -uNr torque-3.0.1.ORIG/src/include/libpbs.h torque-3.0.1/src/include/libpbs
|
|||||||
+#define MUNGE_SIZE 1024 /* I do not know what the proper size of this should be. My
|
+#define MUNGE_SIZE 1024 /* I do not know what the proper size of this should be. My
|
||||||
testing with munge shows it creates a string of 128 bytes */
|
testing with munge shows it creates a string of 128 bytes */
|
||||||
|
|
||||||
/* enums for standard job files (sync w/TJobFileType[]) */
|
|
||||||
|
--
|
||||||
|
1.8.4.2
|
||||||
|
|
||||||
|
1221
torque.spec
1221
torque.spec
File diff suppressed because it is too large
Load Diff
@ -1,11 +1,11 @@
|
|||||||
[Desktop Entry]
|
[Desktop Entry]
|
||||||
Encoding=UTF-8
|
Encoding=UTF-8
|
||||||
Name=xpbs
|
Name=xPBS
|
||||||
GenericName=PBS/TORQUE client
|
GenericName=PBS/TORQUE client
|
||||||
Comment=View job status and submit jobs
|
Comment=View job status and submit jobs
|
||||||
Exec=xpbs
|
Exec=xpbs
|
||||||
Icon=xpbs.png
|
Icon=xpbs.png
|
||||||
Terminal=false
|
Terminal=false
|
||||||
Type=Application
|
Type=Application
|
||||||
Categories=Application;Other;
|
Categories=Education;Science;ComputerScience;ParallelComputing;
|
||||||
Version=1.1.12
|
Version=1.0
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
[Desktop Entry]
|
[Desktop Entry]
|
||||||
Encoding=UTF-8
|
Encoding=UTF-8
|
||||||
Name=xpbsmon
|
Name=xPBSMon
|
||||||
GenericName=PBS/TORQUE cluster monitor
|
GenericName=PBS/TORQUE cluster monitor
|
||||||
Comment=View node status
|
Comment=View node status
|
||||||
Exec=xpbsmon
|
Exec=xpbsmon
|
||||||
Icon=xpbsmon.png
|
Icon=xpbsmon.png
|
||||||
Terminal=false
|
Terminal=false
|
||||||
Type=Application
|
Type=Application
|
||||||
Categories=Application;Other;
|
Categories=Education;Science;ComputerScience;ParallelComputing;
|
||||||
Version=2.3
|
Version=1.0
|
||||||
|
Loading…
Reference in New Issue
Block a user