From: Daniel G. <dg...@ti...> - 2004-03-18 04:13:25
|
Thanks, Erik. It works just fine. The issue with the interactive jobs is also fixed. I guess it is worth at least a minor revision number... :-) Regards, Daniel On Wed, Mar 17, 2004 at 03:22:23PM -0700, er...@he... wrote: > On Wed, Mar 17, 2004 at 04:45:55PM -0500, Daniel Gruner wrote: > > Hi Erik, > > > > Would you care to actually send me your proposed diffs? I have been > > trying to make sense of your code, but... :-) > > Ok, ok :) I had my head in BJS today anyway for another reason. It > turned out to be a little more involved than what I said below because > bjs wasn't even sending the job ID to bjssub in the interactive case. > There are a few other bug fixes in there too. See the ChangeLog part of > the diff for details on that. The environment variable is > "BJS_JOBID". Note that BJS_JOBIDs may be re-used if bjs is restarted. > > - Erik > > > On Wed, Mar 17, 2004 at 12:46:15PM -0700, er...@he... wrote: > > > On Wed, Mar 17, 2004 at 11:36:56AM -0500, Daniel Gruner wrote: > > > > Hi > > > > > > > > Is there an environment variable set up by bjs such that the submitted > > > > script can know its id? I know the $NODES variable is set by bjs, but > > > > it would be useful to also have the JOBID set in the environment, so that > > > > one can set up individual directories based on the JOBID, etc. > > > > > > It doesn't. That's a good idea though. I think it could be added > > > with two lines in bjs.c:bjs_job_environment() and the analogous spot > > > (where NODES is set) in bjssub.c. > > > > > > - Erik > > > Index: ChangeLog > =================================================================== > RCS file: /home/repository/bjs/ChangeLog,v > retrieving revision 1.7 > retrieving revision 1.10 > diff -u -r1.7 -r1.10 > --- ChangeLog 10 Nov 2003 19:48:15 -0000 1.7 > +++ ChangeLog 17 Mar 2004 22:41:45 -0000 1.10 > @@ -1,3 +1,16 @@ > +Changes from 1.4 to > + > + * Fixed signal setup for batch mode jobs started by the daemons. 
> + Signal handling options and signal masks are now properly reset to > + defaults for the child processes. > + > + * Fixed signal handling behavior in the daemon so that the daemon > + won't get slain by SIGPIPE. > + > + * Fixed a problem with node ranges not getting assigned properly. > + > + * Added BJS_JOBID environment variable. > + > Changes from 1.3 to 1.4 > > * Updated to work reasonably on x86_64 machines. > Index: bjs.c > =================================================================== > RCS file: /home/repository/bjs/bjs.c,v > retrieving revision 1.27 > retrieving revision 1.30 > diff -u -r1.27 -r1.30 > --- bjs.c 10 Nov 2003 19:48:15 -0000 1.27 > +++ bjs.c 17 Mar 2004 22:41:45 -0000 1.30 > @@ -24,7 +24,7 @@ > * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public > * License for more detail. > * > - * $Id: bjs.c,v 1.27 2003/11/10 19:48:15 hendriks Exp $ > + * $Id: bjs.c,v 1.30 2004/03/17 22:41:45 hendriks Exp $ > *--------------------------------------------------------------------*/ > #include <stdio.h> > #include <stdlib.h> > @@ -318,10 +318,10 @@ > > static struct bproc_node_set_t clean_set = BPROC_EMPTY_NODESET; > void bjs_do_clean(void) { > +#if 1 > int i, j, nprocs, killed_one; > struct bproc_proc_info_t *plist; > > - > /* XXX It would be much better if we had the option of killing > * only the processes related to the job. We're going to end up > * killing off mon, etc. here. */ > @@ -340,6 +340,9 @@ > > if (nprocs > 0) free(plist); > } while(killed_one); > +#else > +#warning "bjs_do_clean is commented out!!!!" 
> +#endif > > bproc_nodeset_free(&clean_set); > } > @@ -435,6 +438,9 @@ > tmp[len-1] = 0; > printf("NODES=\"%s\"\n", tmp); fflush(0); > setenv("NODES", tmp, 1); > + > + sprintf(tmp, "%d", j->job_id); > + setenv("BJS_JOBID", tmp, 1); > } > > static > @@ -524,8 +530,18 @@ > return -1; > } > if (pid == 0) { > - /* Close file descriptors for clients */ > struct list_head *l; > + sigset_t sset; > + > + /* Restore signal handling defaults */ > + signal(SIGCHLD, SIG_DFL); > + signal(SIGHUP, SIG_DFL); > + signal(SIGPIPE, SIG_IGN); > + > + sigfillset(&sset); > + sigprocmask(SIG_UNBLOCK, &sset, 0); > + > + /* Close file descriptors for clients */ > for (l=clients.next; l != &clients; l = l->next) { > struct client_t *c; > c = list_entry(l, struct client_t, list); > @@ -572,6 +588,8 @@ > char tmp[20]; > > sx = sexp_create_list("nodes", 0); > + sprintf(tmp, "%d", j->job_id); > + sexp_append_atom(sx, tmp); > for (l = j->nodes.next; l != &j->nodes; l = l->next) { > struct node_alloc_t *n = list_entry(l,struct node_alloc_t,nodes_list); > sprintf(tmp, "%d", bjs_node_idx[n->node]->node); > @@ -1617,61 +1635,6 @@ > return current_pool; > } > > -#if 0 > -static > -int do_nodelist(char *str_, int **numlist_) { > - char *end1, *end2, *next, *str; > - int num1, num2, i; > - int numlist_len = 0; > - int *numlist = 0; > - > - for (str = str_; *str; str = next) { > - /* Look for a number */ > - num1 = strtol(str, &end1, 0); > - switch (*end1) { > - case 0: > - num2 = num1; > - next = end1; > - break; > - case ',': > - num2 = num1; > - next = end1+1; > - break; > - case '-': > - num2 = strtol(end1+1, &end2, 0); > - if (end2 == end1+1) { > - if (numlist) free(numlist); > - return -1; > - } > - switch (*end2) { > - case 0: > - next = end2; > - break; > - case ',': > - next = end2+1; > - break; > - default: > - if (numlist) free(numlist); > - return -1; > - } > - break; > - default: > - if (numlist) free(numlist); > - return -1; > - } > - > - /* Fill in range */ > - numlist = 
realloc_chk(numlist, > - sizeof(int) * (numlist_len + num2 - num1 + 1)); > - for (i = num1; i <= num2; i++) > - numlist[numlist_len++] = i; > - } > - > - * numlist_ = numlist; > - return numlist_len; > -} > -#endif > - > static > int config_nodes_callback(struct cmconf *cnf, char **args) { > int i, j; > @@ -1694,14 +1657,12 @@ > return -1; > } > > - /*printf("%d %d %s\n", p->nnodes, ns2.size, args[i]);*/ > - > p->nodes = realloc_chk(p->nodes, sizeof(int) * (p->nnodes + ns2.size)); > for (j=0; j < ns2.size; j++) { > /* XXX Do we want to sanity check node numbers at this > * point? It seems that we need to be able to handle > * bogus node numbers. */ > - p->nodes[p->nnodes++] = ns.node[j].node; > + p->nodes[p->nnodes++] = ns2.node[j].node; > /* We handle machine setup inside config_xfer */ > } > bproc_nodeset_free(&ns); > @@ -2398,6 +2359,7 @@ > "Usage: %s [options...]\n" > " -h Print this message and exit.\n" > " -V Print version information and exit.\n" > +" -v Increase verbose level.\n" > " -C file Read configuration from file (default=%s)\n" > , arg0, DEFAULT_CONFIG_FILE); > } > @@ -2474,7 +2436,7 @@ > sigprocmask(SIG_BLOCK, &sset, 0); > signal(SIGCHLD, signal_handler); > signal(SIGHUP, signal_handler); > - //signal(SIGPIPE, SIG_IGN); > + signal(SIGPIPE, SIG_IGN); > > /*-- main select loop ------------------------------------------*/ > while (1) { > Index: bjssub.c > =================================================================== > RCS file: /home/repository/bjs/bjssub.c,v > retrieving revision 1.12 > retrieving revision 1.13 > diff -u -r1.12 -r1.13 > --- bjssub.c 19 Sep 2002 20:28:20 -0000 1.12 > +++ bjssub.c 17 Mar 2004 22:41:45 -0000 1.13 > @@ -24,7 +24,7 @@ > * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public > * License for more detail. 
> * > - * $Id: bjssub.c,v 1.12 2002/09/19 20:28:20 hendriks Exp $ > + * $Id: bjssub.c,v 1.13 2004/03/17 22:41:45 hendriks Exp $ > *--------------------------------------------------------------------*/ > #include <stdio.h> > #include <stdlib.h> > @@ -292,16 +292,19 @@ > > /* Put together the nodes string and stick it in the environment */ > len = 0; > - for (sx = nodesx->list->next; sx; sx=sx->next) > + for (sx = nodesx->list->next->next; sx; sx=sx->next) > len += strlen(sx->val) + 1; > nodesstr = alloca(len); > nodesstr[0] = 0; > - for (sx = nodesx->list->next; sx; sx=sx->next) { > + for (sx = nodesx->list->next->next; sx; sx=sx->next) { > strcat(nodesstr, sx->val); > if (sx->next) strcat(nodesstr, ","); > } > setenv("NODES", nodesstr, 1); > printf("NODES=%s\n", nodesstr); > + > + setenv("BJS_JOBID", nodesx->list->next->val, 1); > + printf("BJS_JOBID=%s\n", nodesx->list->next->val); > > if (pwd && chdir(pwd)) { > fprintf(stderr, "chdir(\"%s\"): %s\n", pwd, strerror(errno)); > Index: sexps.txt > =================================================================== > RCS file: /home/repository/bjs/sexps.txt,v > retrieving revision 1.3 > retrieving revision 1.4 > diff -u -r1.3 -r1.4 > --- sexps.txt 17 Sep 2002 03:55:10 -0000 1.3 > +++ sexps.txt 17 Mar 2004 22:41:45 -0000 1.4 > @@ -20,7 +20,7 @@ > > Job submission (interactive) responses: > (ok ID) > -(nodes NODE ...) > +(nodes ID NODE ...) > (error MSG) > > -- Dr. Daniel Gruner dg...@ti... Dept. of Chemistry dan...@ut... University of Toronto phone: (416)-978-8689 80 St. George Street fax: (416)-978-5325 Toronto, ON M5S 3H6, Canada finger for PGP public key |