Infiniband support for NMon

Help
2013-04-30
2015-03-13
  • Denis Cerkvin
    Denis Cerkvin
    2013-04-30

    Hi,

    I have made some changes in NMon 14g so it may monitor Infiniband usage.
    Also I modified the behaviour of the "show_net" part, so "ncurses" may switch the scale from KB/s to MB/s for each network interface separately and properly clear the area after "0" has been pressed.

    This patch is quite bulky, I tried to add it to the "patches" part of the site, but I guess I need to be a project developer to do so.

    If you want to get the patch, the "schema" file and a compiled binary for Linux - I have them on my website too.
    Of course, that is just for convenience - there is no other "nmon" site but this one.

    Thanks, guys!


    linux]$ cat infiniband.patch
    --- lmon14g.c 2013-04-15 08:54:09.000000000 +1000
    +++ lmon14g_reat_DC.c 2013-04-29 09:58:48.078942778 +1000
    @@ -1,16 +1,33 @@
    /*
      * lmon.c - Curses based Performance Monitor for Linux
      * Developer: Nigel Griffiths.
    + * ====================================================
    + *
    + *
    + * ------------------------------------------
    + * Linux-only specific additions, April 2013:
    + * - correct CPU idle figures on SMP linux systems;
    + * - new Env. Vars. to exclude duplicated or not needed:
    + *   - disks
    + *   - network interfaces
    + * - added traffic accounting for Infiniband Networks (IB / RDMA);
    + * - new Env. Var., listing the file to read IB counters from.
    + * =================================================
    + * Sincerely,
    + * Denis Cerkvin
    + *
    + * The Bible for command line people.
    + * http://www.read-and-think.org/kjv.html
    + *
    + * Библия для людей, работающих с командной строкой.
    + * http://www.read-and-think.org/ 
    + * =================================================
    + * ------------------------------------------
      */

    -/*
    - * Use the following Makefile (for Linux on POWER)
    -CFLAGS=-g -D JFS -D GETUSER -Wall -D LARGEMEM -D POWER
    -LDFLAGS=-lcurses
    -nmon: lnmon.o
    - * end of Makefile
    - */
    -/* #define POWER 1 */
    +/* #define POWER 0
    + * DO NOT use this version of the program on anything other than Linux.
    + * */
    /* #define KERNEL_2_6_18 1 */
    /* This adds the following to the disk stats
    pi_num_threads,
    @@ -85,7 +102,6 @@
    #define REALLOC(argument1,argument2)    realloc(argument1,argument2)
    #endif /* MALLOC STUFF */

    -
    #define P_CPUINFO 0
    #define P_STAT 1
    #define P_VERSION 2
    @@ -156,9 +172,31 @@
    int read_this_interval; /* track updates for each update to stop  double data collection */
    } proc;

    -void proc_init()
    +/* Denis Cerkvin:
    + * I realise that I am processing Env.Variables (exclusion lists and IB)
    + * on each loop's iteration for each disk / interface, and on every main
    + * loop iteration. I considered moving Env.Vars to the start of main
    + * program, into "proc_init" and decided not to do it. This NMON program
    + * tends to keep all pieces of data / environments / defines as close as
    + * possible to the processing code, so this "local" approach fits better.
    + */
    +
    +void proc_init()
    {
    int i;
    +char *envib = getenv("NMON_IB_COUNTERS");;
    +
    + // Resetting IB counters
    + if (envib) { // IB path has been defined.
    + // DEBUG
    + // fprintf(stderr, "Found IB devices in NMON_IB_COUNTERS.\n");
    + if (system("perfquery -r") == 0)
    +   fprintf(stderr, "IB performance counters have been cleared.\n");
    + else
    +   fprintf(stderr, "Could NOT clear IB performance counters!\n");
    + } else
    + fprintf(stderr, "No IB devices found in NMON_IB_COUNTERS.\n");
    +
    /* Initialise the file pointers */
    for(i=0;i<P_NUMBER;i++) {
    proc_.fp = 0;
    @@ -652,8 +690,9 @@
    };

    #define ulong unsigned long
    +#define DK_NAME_MAX_LEN 32
    struct dsk_stat {
    - char dk_name;
    + char dk_name;
    int dk_major;
    int dk_minor;
    long dk_noinfo;
    @@ -794,8 +833,9 @@
    };

    #define NETMAX 32
    +#define IF_NAME_MAX_LEN 17
    struct net_stat {
    - unsigned long if_name;
    + unsigned long if_name;
    unsigned long long if_ibytes;
    unsigned long long if_obytes;
    unsigned long long if_ipackets;
    @@ -1535,7 +1575,8 @@
    p->cpu_total.uptime=atof(proc.line);
    for(i=0;i<strlen(proc.line);i++) {
    if(proc.line == ' ') {
    - p->cpu_total.idletime=atof(&proc.line);
    +// Used to be p->cpu_total.idletime=atof(&proc.line);
    + p->cpu_total.idletime=atof(&proc.line)/cpus;
    break;
    }
    }
    @@ -1617,6 +1658,7 @@
    {
    static FILE *fp = (FILE *)-1;
    char buf;
    +char *envp; // Env. Var with names of disks.
    int i;
    int ret;

    @@ -1639,7 +1681,15 @@
        8    0 sda 990 2325 4764 6860 9 3 12 417 0 6003 7277
        8    1 sda1 3264 4356 12 12
    */
    +
    + // Getting the pointer to the string with names of excluded disks.
    + envp=getenv("NMON_EXCLUDE_DISKS");
    + // Here we account for space separated list of quoted DISKMAX devices.
    + if (envp && (strlen(envp) > (DK_NAME_MAX_LEN+3)*DISKMAX-2) ) // Something is wrong with Env. Var.
    + envp=NULL;
    +
    for(i=0;i<DISKMAX;) {
    +
    if(fgets(buf,1024,fp) == NULL)
    break;
    /* zero the data ready for reading */
    @@ -1673,6 +1723,7 @@
    &p->dk.dk_inflight,
    &p->dk.dk_time,
    &p->dk.dk_11 );
    +
    if(ret == 7) { /* shuffle the data around due to missing columns for partitions */
    p->dk.dk_partition = 1;
    p->dk.dk_wkb = p->dk.dk_rmsec;
    @@ -1686,6 +1737,19 @@
    else fprintf(stderr,"disk sscanf wanted 14 but returned=%d line=%s\n",
    ret,buf);

    + if(envp) { // Got disks to exclude!
    +                // export NMON_EXCLUDE_DISKS="'dm-0' 'dm-1' 'sda1' 'sda2' 'sda'"
    +                // export NMON_EXCLUDE_DISKS="'cciss/c0d0p1' 'cciss/c0d0'"
    +                 sprintf(buf, "'%s'", &p->dk.dk_name); // 'sda'
    +                 // Is this disk one of them?
    +                  if ( strstr(envp, buf) ) {
    +                   // DEBUG fprintf(stderr,"skipping disk '%s' found in %s\n",
    +                   //  &p->dk.dk_name, envp);
    +                   continue;
    +    // No need to decrement "i" here, see loop condition and below.
    +                  }
    +                } // No excluded disks.
    +
    p->dk.dk_rkb /= 2; /* sectors = 512 bytes */
    p->dk.dk_wkb /= 2;
    p->dk.dk_xfers = p->dk.dk_reads + p->dk.dk_writes;
    @@ -3003,18 +3067,117 @@
    show_vm   = 1;
    }

    +#define BUFLEN 1024 // For IB Env.Variables and files processing.
    +
    +long unsigned int read_ib_counter_file(
    + char *counter_path, char *counter_name)
    +{
    + long int res=0, ibvalue=0;
    + char *fname;
    + FILE *ib_counter_file=NULL;
    +
    +sprintf(fname, "%s/%s\0", counter_path, counter_name);
    + if (ib_counter_file = fopen(fname, "r")) {
    +   if (fscanf(ib_counter_file, "%lu", &ibvalue) != 1)
    +     fprintf(stderr,"RDMA:ib - wrong number in %s counter\n", fname);
    +   else {
    +    res= 4 * ibvalue;
    +   }
    + } else
    +   fprintf(stderr,"RDMA:ib - failed to open %s counter\n", fname);
    +
    +if (ib_counter_file)
    + fclose(ib_counter_file); // Always close the file.
    +return res;
    +}
    +
    +void read_ib_counter( int ib_index, char *counter_path) {
    +/* This procedure reads the following "unsigned long" counters
    + * for a single IB interface:
    + * - port_rcv_data - number of received 4-byte words, *not* bytes.
    + * - port_xmit_data - number of transmitted 4-byte words, *not* bytes.
    + * If any of the "data" counters above shows 4294967295, it has "filled up".
    + * - port_rcv_packets - number of received IB/RDMA packets
    + * - port_xmit_packets - number of sent IB/RDMA packets
    + * - port_rcv_errors - number of errors while receiving
    + * I mapped these below "freely":
    + * - port_rcv_remote_physical_errors - number of receive "drops"
    + * - port_xmit_constraint_errors - transmission errors
    + * - port_xmit_discards - transmission "drops"
    + * - local_link_integrity_errors - output collisions "ocolls"
    + * - link_downed - output carrier errors "ocarrier"
    + */
    + int net_index=0;
    +
    + networks ++; // One more IB interface
    + // DEBUG
    + // fprintf(stderr,"* read_ib_counter ib%d, Nets: %d, Path:%s *\n",
    + //  ib_index, networks, counter_path);
    + net_index=networks-1; // Starts from 0, but "networks" shows plain count from 1.
    + sprintf((char *)&p->ifnets.if_name, "RDMA:ib%d", ib_index);
    +
    + // Reading counters.
    + p->ifnets.if_ibytes =
    +  read_ib_counter_file(counter_path, "port_rcv_data");
    + p->ifnets.if_obytes =
    +  read_ib_counter_file(counter_path, "port_xmit_data");
    + p->ifnets.if_ipackets =
    +  read_ib_counter_file(counter_path, "port_rcv_packets");
    + p->ifnets.if_opackets =
    +  read_ib_counter_file(counter_path, "port_xmit_packets");
    + p->ifnets.if_ierrs =
    +  read_ib_counter_file(counter_path, "port_rcv_errors");
    + p->ifnets.if_idrop =
    +  read_ib_counter_file(counter_path, "port_rcv_remote_physical_errors");
    + p->ifnets.if_oerrs =
    +  read_ib_counter_file(counter_path, "port_xmit_constraint_errors");
    + p->ifnets.if_odrop =
    +  read_ib_counter_file(counter_path, "port_xmit_discards");
    + p->ifnets.if_ocolls =
    +  read_ib_counter_file(counter_path, "local_link_integrity_errors");
    + p->ifnets.if_ocarrier =
    +  read_ib_counter_file(counter_path, "link_downed");
    +
    +} // read_ib_counter
    +
    void proc_net()
    +/* This procedure reads normal Ethernet and Infiniband interfaces.
    + *
    + * IB OFED implementation differs from standard networking stuff
    + * and do not show up on the output of "ifconfig" etc.
    + * Therefore, we need to use in this procedure two different sources:
    + * - /proc/net/dev to show IP stats;
    + * - /sys/class/infiniband subsystem.
    + * For IB, naming of interfaces and lids/ports is quite complex.
    + * So, we require the user to define the Env.Var. "NMON_IB_COUNTERS", showing us
    + * the full paths to the "counter" files in the IB "sys" tree.
    + *
    + * Example:
    + * export NMON_IB_COUNTERS=" '/sys/class/infiniband/mlx4_0/ports/1/counters' \
    + * '/sys/class/infiniband/mlx4_0/ports/2/counters' "
    + *
    + * Here we define a pretty much standard Mellanox IB dual-port card, assuming that
    + * both ports are used and need to be accounted for. Bonding of IB interfaces
    + * does not matter here, since it works only on the IP level.
    + *
    + * If you use only, let's say, port 2 on your IB card, export only single path:
    + * export NMON_IB_COUNTERS=" '/sys/class/infiniband/mlx4_0/ports/2/counters' "
    + *
    + * This procedure will use pre-defined names of OFED counters to get data.
    + */
    {
    static FILE *fp = (FILE *)-1;
    -char buf;
    +char buf;
    +char *envp; // Env. Var with names of net.interfaces.
    int i=0;
    -int ret;
    +int ret=0;
    +char *t;
    unsigned long junk;

    +// processing "classic" Ethernet files.
    if( fp == (FILE *)-1) {
                if( (fp = fopen("/proc/net/dev","r")) == NULL) {
    error("failed to open - /proc/net/dev");
    - networks=0;
    return;
       }
    }
    @@ -3028,8 +3191,16 @@
       sit0:       0       0    0    0    0     0          0         0        0       0    0    0    0     0       0          0
       eth1:       0       0    0    0    0     0          0         0        0       0    0    0    0     0       0          0
    */
    +
    + // Getting the pointer to the string with names of excluded network interfaces.
    +        envp=getenv("NMON_EXCLUDE_IFS");
    +        // Here we account for space separated list of quoted NETMAX devices.
    +        if (envp && (strlen(envp) > (IF_NAME_MAX_LEN+3)*NETMAX-2) ) // Something is wrong with Env. Var.
    +           envp=NULL;
    +
    for(i=0;i<NETMAX;i++) {
    - if(fgets(buf,1024,fp) == NULL)
    +
    + if(fgets(buf,1024,fp) == NULL) // This would leave i increased.
    break;
    strip_spaces(buf);
         /* 1   2   3    4   5   6   7   8   9   10   11   12  13  14  15  16 */
    @@ -3041,8 +3212,8 @@
    &p->ifnets.if_idrop,
    &p->ifnets.if_ififo,
    &p->ifnets.if_iframe,
    - &junk,
    - &junk,
    + &junk, // Skipping "compressed" ..
    + &junk, // and "multicast".
    &p->ifnets.if_obytes,
    &p->ifnets.if_opackets,
    &p->ifnets.if_oerrs,
    @@ -3050,16 +3221,62 @@
    &p->ifnets.if_ofifo,
    &p->ifnets.if_ocolls,
    &p->ifnets.if_ocarrier
    + // Quietly ignoring last column "compressed".
    );
    if(ret != 16)
    fprintf(stderr,"sscanf wanted 16 returned = %d line=%s\n", ret, (char *)buf);
    - }
    +
    + if(envp) { // Got interfaces to exclude!
    +                // export NMON_EXCLUDE_IFS="'wlan0' 'lo' 'vboxnet0' 'vboxnet2'"
    +                // export NMON_EXCLUDE_IFS="'bond0' 'ib0' 'ib1'"
    +                 sprintf(buf, "'%s'", (char *) &p->ifnets.if_name); // 'wlan0'
    +                 // Is this interface one of them?
    +                  if ( strstr(envp, buf) ) {
    +    i--; // "i" is incremented in the loop condition, so need to decrease to re-use.
    +                   continue;
    +                  }
    +                } // No excluded interfaces.
    +
    + } // End of processing lines from "/proc/net/dev" output.
    +
    + networks = i; // Networks count starts from 0, but we increased i one extra time.
    + // DEBUG
    + // fprintf(stderr,"** inside proc_net - IP Nets: %d **\n", networks);
    +
    + // Start processing of IB interface(s) counter(s).
    + // We do not check them with "exclude" list, since user defines counter
    + // pathes himself and can omit whatever is not necessary.
    +
    +
    + /* Getting the pointer to the string with names of IB counters.
    + * export NMON_IB_COUNTERS=" '/var/tmp/sys/class/infiniband/mlx4_0/ports/1/counters' \
    + * '/var/tmp/sys/class/infiniband/mlx4_0/ports/2/counters' "
    + */
    + buf='\0'; // Null-terminate the buffer.
    + envp=getenv("NMON_IB_COUNTERS"); // Can safely re-use "envp" and "ret" now.
    +        if ( !envp || (strlen(envp) > BUFLEN-1)) // Something is wrong with Env. Var.
    +         goto end;
    + // Making a local copy of environment variable, so I may tokenize it safely.
    + strcpy(buf, envp); // Copying only available chars, without null-padding up to BUFLEN.
    +
    + ret = 0; t=NULL;
    + t=strtok(buf, "' \"");  // first path without quotes spaces or commas
    + // … processing first path
    + read_ib_counter(ret, t);
    + while (t=strtok(NULL, ", '")){ // next path …
    + // … processing this path
    + ret ++;
    + read_ib_counter(ret, t);
    +        };
    +
    + // GOTO label to skip "/proc/net/dev" headers, I want it to stay here,
    + // so I do not re-read IB counters multiple times and follow the general
    + // "networking device" logic of the program.
    end:
    if(reread) {
    fclose(fp);
    fp = (FILE *)-1;
    } else rewind(fp);
    - networks = i;
    }

    @@ -3516,7 +3733,7 @@
                             x=x+(rows);     \
                             if(x+4>LINES) { \
                                     room=0; \
    -                                mvwprintw(stdscr,LINES-1,10,"Warning: Some Statistics may not shown"); \
    +                                mvwprintw(stdscr,LINES-1,10,"Warning: Some Statistics may be not shown"); \
                             }               \
                            }

    @@ -3776,6 +3993,9 @@
    proc_net();
    memcpy(q->ifnets, p->ifnets, sizeof(struct net_stat) * networks);
    for(i=0;i<networks;i++) {
    + // DEBUG
    + // fprintf(stderr, "MAIN:Processing Nets: %d of %d - %s\n",
    + //  i, networks, p->ifnets.if_name);
    net_read_peak=0.0;
    net_write_peak=0.0;
    }
    @@ -4854,13 +5074,15 @@
    (float)p->cpu_total.mins5,
    updays, uphours, upmins);

    - mvwprintw(padker,4, 1, "Interrupts     %8.1f   15 mins %5.2f    Average CPU use=%6.2f%%",
    + mvwprintw(padker,4, 1, "Interrupts     %8.1f   15 mins %5.2f    Average CPU use=%6.2f%% (%d CPUs)",
    (float)(p->cpu_total.intr - q->cpu_total.intr)/elapsed,
    (float)p->cpu_total.mins15,
    (float)(
    (p->cpu_total.uptime -
    p->cpu_total.idletime)/
    - p->cpu_total.uptime *100.0));
    + p->cpu_total.uptime *100.0),
    + cpus
    + );
    DISPLAY(padker,5);
    } else {
    if(proc_first_time) {
    @@ -5087,7 +5309,28 @@
    if(net_write_peak < IFDELTA(if_obytes) / 1024.0)
    net_write_peak = IFDELTA(if_obytes) / 1024.0;

    - CURSE mvwprintw(padnet,2 + i, 0, "%8s %7.1f %7.1f    %6.1f   %6.1f  %6.1f %6.1f    %7.1f %7.1f   ",
    + if ( (IFDELTA(if_ibytes) > 1048576) ||
    +      (IFDELTA(if_obytes) > 1048576) ||
    +      net_read_peak > 1024 || net_write_peak >1024 // Peak times are in KB/s
    +    )
    + { // Reached Megabytes per sec.
    + COLOUR wattrset(padnet,COLOR_PAIR(3));
    + mvwprintw(padnet,0, 31, "some in MB/s");
    + CURSE mvwprintw(padnet,2 + i, 0, "%8s %7.1f %7.1f    %6.1f   %6.1f  %6.1f %6.1f    %7.1f %7.1f MB/s",
    +                                    &p->ifnets.if_name,
    +                                    IFDELTA(if_ibytes) / 1024.0 / 1024.0,
    +                                    IFDELTA(if_obytes) / 1024.0 / 1024.0,
    +                                    IFDELTA(if_ipackets),
    +                                    IFDELTA(if_opackets),
    +                                    IFDELTA_ZERO(if_ibytes, if_ipackets),
    +                                    IFDELTA_ZERO(if_obytes, if_opackets),
    +                                    net_read_peak / 1024.0,
    +                                    net_write_peak / 1024.0
    +                                        );
    + COLOUR wattrset(padnet,COLOR_PAIR(0));
    + }
    + else // Leaving everything in default, Kilobytes per sec.
    + CURSE mvwprintw(padnet,2 + i, 0, "%8s %7.1f %7.1f    %6.1f   %6.1f  %6.1f %6.1f    %7.1f %7.1f             ",
        &p->ifnets.if_name,
        IFDELTA(if_ibytes) / 1024.0,  
        IFDELTA(if_obytes) / 1024.0,
    @@ -5323,7 +5566,8 @@
    case DISK_MODE_IO:         mvwprintw(paddisk, 0, 12, "/proc/stat+disk_io");break;
    }
    mvwprintw(paddisk,0, 31, "mostly in KB/s");
    - mvwprintw(paddisk,0, 50, "Warning:contains duplicates");
    + // reat: not now. I can exclude duplicated disks.
    + // mvwprintw(paddisk,0, 50, "Warning:contains duplicates");
    switch (show_disk) {
    case SHOW_DISK_STATS:
    mvwprintw(paddisk,1, 0, "DiskName Busy    Read    Write       Xfers   Size  Peak%%  Peak-RW    InFlight ");
    $  _

     
  • Denis Cerkvin
    Denis Cerkvin
    2013-04-30

    … Just a short follow-up on 2 other patches I noticed:

    - 2972508 - Patch to fix reading of /proc/vmstat on newer kernels
    - 2899723 - patch to record io wait on newer kernels

    They were implemented a long time ago by the program's author, and the issues listed are not present in version 14g.

    In terms of IO Waits, I have checked nmon 14g vs "top" on 3.8.8-202 - works correctly.

    For Virtual Memory, that functionality change has already been implemented in 14g.
    The new macro "GETVM" uses the new procedure "get_vm_value", rather than the older
    "read_vmline" mentioned in the patch.

    I have checked 14g with:
    $ cat /proc/vmstat | egrep "(pgfree|pgfault|pgpgin|pgpgout)"; tail -4 /tmp/nmon.err
    pgpgin 1633171
    pgpgout 4779157
    pgfree 20275945
    pgfault 12233041
    … pgpgin:1633171
    … pgpgout:4779157
    … pgfree:20275747
    … pgfault:12232511

    So, this latest version 14g can be fully trusted :-)

    I also have checked IB traffic accounting in my patch and it works properly.

     
  • voran
    voran
    2015-03-13

    Hello,

    I tried to apply this patch to nmon 14g but with no success; could you describe how to do it?
    The patch to exclude disk devices seems to be missing in 14i.

    Edit: Finally it's OK - I was able to use this patch without the patch command, by modifying the source directly with the information in this thread.

    Another way is to use the -g option.

    Bye ;-)

    Voran

     
    Last edit: voran 2015-03-15