From: Krzysztof K. <krz...@mo...> - 2015-06-10 09:33:15
|
Dear Eugene, In MooseFS 2.0.68 source you’ll find, that the message “connect failed, error: ETIMEDOUT” is logged by mainserv_connect() function in mfschunkserver, when TCP connection times out. 167 int mainserv_connect(uint32_t fwdip,uint16_t fwdport,uint32_t timeout) { 168 int fwdsock; 169 fwdsock = tcpsocket(); 170 if (fwdsock<0) { 171 mfs_errlog(LOG_WARNING,"create socket, error"); 172 return -1; 173 } 174 if (tcpnonblock(fwdsock)<0) { 175 mfs_errlog(LOG_WARNING,"set nonblock, error"); 176 tcpclose(fwdsock); 177 return -1; 178 } 179 if (tcpnumtoconnect(fwdsock,fwdip,fwdport,timeout)<0) { 180 mfs_errlog(LOG_WARNING,"connect failed, error"); 181 tcpclose(fwdsock); 182 return -1; 183 } 184 if (tcpnodelay(fwdsock)<0) { 185 mfs_errlog(LOG_WARNING,"can't set TCP_NODELAY, error"); 186 } 187 return fwdsock; 188 } The function mainserv_connect() is called in the following loop within mfschunkserver's mainserv_write() function, that retires the connection if it was not successful (i.e. resulted in ETIMEDOUT): 1112 for (i=0 ; i<CONNECT_RETRIES && fwdsock<0 ; i++) { 1113 #ifdef USE_CONNCACHE 1114 if (i==0) { 1115 fwdsock = conncache_get(fwdip,fwdport); 1116 } 1117 if (fwdsock<0) { 1118 fwdsock = mainserv_connect(fwdip,fwdport,CONNECT_TIMEOUT(i)); 1119 } 1120 #else 1121 fwdsock = mainserv_connect(fwdip,fwdport,CONNECT_TIMEOUT(i)); 1122 #endif 1123 if (fwdsock>=0) { 1124 packet = mainserv_create_packet(&wptr,CLTOCS_WRITE,length-6); 1125 if (protover) { 1126 put8bit(&wptr,protover); 1127 } 1128 put64bit(&wptr,gchunkid); 1129 put32bit(&wptr,gversion); 1130 if (protover) { 1131 memcpy(wptr,data,length-13-6); 1132 } else { 1133 memcpy(wptr,data,length-12-6); 1134 } 1135 if (mainserv_send_and_free(fwdsock,packet,length-6)) { 1136 break; 1137 } 1138 tcpclose(fwdsock); 1139 fwdsock=-1; 1140 } 1141 } As you can see each time the connection is retired (by design system is retrying 10 times) the timeout value is adjusted according to the following formulas: 63 #define CONNECT_RETRIES 10 64 #define CONNECT_TIMEOUT(cnt) (((cnt)%2)?(300*(1<<((cnt)>>1))):(200*(1<<((cnt)>>1)))) Using simple program provided below you can see what the timeout values are for each iteration: 1 #include <stdio.h> 2 3 #define CONNECT_RETRIES 10 4 #define CONNECT_TIMEOUT(cnt) (((cnt)%2)?(300*(1<<((cnt)>>1))):(200*(1<<((cnt)>>1)))) 5 6 int main() { 7 for(int i=0; i<CONNECT_RETRIES; i++) { 8 printf("%d, ", CONNECT_TIMEOUT(i)); 9 } 10 printf("\n"); 11 return 0; 12 } $ gcc -o retry retry.c $ ./retry 200, 300, 400, 600, 800, 1200, 1600, 2400, 3200, 4800, So in your case it might be some network issue (network congestion?), that resulted in few retries on TCP connect. It’s also important to note, that although there were some timeouts, the request was processed successfully due to the fact that the system has retry capability and the message is a WARNING not an ERROR that’s why in changelog you will not see any issues. Should the connection fail after predefined number of retries (default 10) the client would get the packet back with the information that there was an error during I/O transaction processing. Message "got unknown message (type:212)” was a result of the small bug in processing i/o when timeout occurs and has beed fixed in the release that we will release today/tomorrow. Best Regards, Krzysztof Kielak Director of Operations and Customer Support Mobile: +48 601 476 440 > On 10 Jun 2015, at 07:13, Eugene Diatlov <it...@da...> wrote: > > Hi, > > How to debug such messages? > > running 2.0.68 version. > > tail -f /var/log/daemon.log > Jun 10 01:05:29 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:29 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:29 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:30 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:30 node1 mfschunkserver[1603]: got unknown message (type:212) > Jun 10 01:05:30 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:30 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:31 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:31 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:32 node1 mfschunkserver[1603]: connect failed, error: ETIMEDOUT (Operation timed out) > Jun 10 01:05:32 node1 mfschunkserver[1603]: got unknown message (type:212) > > At the same time other logs (and system at all) looks stable. > tail -f ./changelog_ml.0.mfs > 55486822: 1433913127|CREATE(34818,77615220398_1_20150610015234_00000128.mp4,1,420,18,0,0,0):3570 > 55486823: 1433913127|ACQUIRE(321087,3570) > 55486824: 1433913127|CREATE(17649,77619977594_1_20150610030115_00000085.mp4,1,420,18,0,0,0):3574 > 55486825: 1433913127|ACQUIRE(321090,3574) > 55486826: 1433913127|WRITE(3570,0,1):7076220 > 55486827: 1433913127|CREATE(4511,77618939592_1_20150609125244_00000612.mp4,1,420,18,0,0,0):3575 > 55486828: 1433913127|ACQUIRE(321089,3575) > 55486829: 1433913127|CREATE(172741,77612131058_1_20150610040406_00000043.mp4,1,420,18,0,0,0):3576 > 55486830: 1433913127|ACQUIRE(321088,3576) > 55486831: 1433913127|WRITE(3574,0,1):7076221 > 55486832: 1433913127|WRITE(3576,0,1):7076222 > 55486833: 1433913127|WRITE(3575,0,1):7076223 > 55486834: 1433913127|LENGTH(3574,4187102) > 55486835: 1433913127|UNLOCK(7076221) > > > -- > Best regards, > Head of R&D department > Eugene Diatlov > http://datalink.ua <http://datalink.ua/> > ------------------------------------------------------------------------------ > _________________________________________ > moosefs-users mailing list > moo...@li... > https://lists.sourceforge.net/lists/listinfo/moosefs-users |