From: Davies L. <dav...@gm...> - 2012-03-03 02:26:00
|
new patch: --- mfs-1.6.26/mfschunkserver/hddspacemgr.c 2012-02-28 16:18:26.000000000 +0800 +++ mfs-1.6.26-r4/mfschunkserver/hddspacemgr.c 2012-03-02 20:33:29.000000000 +0800 @@ -1691,9 +1691,16 @@ } } +static inline uint64_t get_usectime() { + struct timeval tv; + gettimeofday(&tv,NULL); + return ((uint64_t)(tv.tv_sec))*1000000+tv.tv_usec; +} + void hdd_delayed_ops() { dopchunk **ccp,*cc,*tcc; uint32_t dhashpos; + uint64_t ts,te; chunk *c; // int status; // printf("delayed ops: before lock\n"); @@ -1756,6 +1763,22 @@ if (c->opensteps>0) { // decrease counter c->opensteps--; } else if (c->fd>=0) { // close descriptor + ts = get_usectime(); +#ifdef F_FULLFSYNC + if (fcntl(c->fd,F_FULLFSYNC)<0) { + int errmem = errno; + mfs_arg_errlog_silent(LOG_WARNING,"hdd_delayed_ops: file:%s - fsync (via fcntl) error",c->filename); + errno = errmem; + } +#else + if (fsync(c->fd)<0) { + int errmem = errno; + mfs_arg_errlog_silent(LOG_WARNING,"hdd_delayed_ops: file:%s - fsync (direct call) error",c->filename); + errno = errmem; + } +#endif + te = get_usectime(); + hdd_stats_datafsync(c->owner,te-ts); if (close(c->fd)<0) { hdd_error_occured(c); // uses and preserves errno !!! 
mfs_arg_errlog_silent(LOG_WARNING,"hdd_delayed_ops: file:%s - close error",c->filename); @@ -1792,12 +1815,6 @@ // printf("delayed ops: after unlock\n"); } -static inline uint64_t get_usectime() { - struct timeval tv; - gettimeofday(&tv,NULL); - return ((uint64_t)(tv.tv_sec))*1000000+tv.tv_usec; -} - static int hdd_io_begin(chunk *c,int newflag) { dopchunk *cc; int status; @@ -1891,28 +1908,27 @@ errno = errmem; return status; } - ts = get_usectime(); -#ifdef F_FULLFSYNC - if (fcntl(c->fd,F_FULLFSYNC)<0) { - int errmem = errno; - mfs_arg_errlog_silent(LOG_WARNING,"hdd_io_end: file:%s - fsync (via fcntl) error",c->filename); - errno = errmem; - return ERROR_IO; - } -#else - if (fsync(c->fd)<0) { - int errmem = errno; - mfs_arg_errlog_silent(LOG_WARNING,"hdd_io_end: file:%s - fsync (direct call) error",c->filename); - errno = errmem; - return ERROR_IO; - } -#endif - te = get_usectime(); - hdd_stats_datafsync(c->owner,te-ts); } c->crcrefcount--; if (c->crcrefcount==0) { if (OPENSTEPS==0) { + ts = get_usectime(); +#ifdef F_FULLFSYNC + if (fcntl(c->fd,F_FULLFSYNC)<0) { + int errmem = errno; + mfs_arg_errlog_silent(LOG_WARNING,"hdd_io_end: file:%s - fsync (via fcntl) error",c->filename); + errno = errmem; + } +#else + if (fsync(c->fd)<0) { + int errmem = errno; + mfs_arg_errlog_silent(LOG_WARNING,"hdd_io_end: file:%s - fsync (direct call) error",c->filename); + errno = errmem; + } +#endif + te = get_usectime(); + hdd_stats_datafsync(c->owner,te-ts); + if (close(c->fd)<0) { int errmem = errno; c->fd = -1; @@ -3766,6 +3782,7 @@ // } prevf = NULL; c = hdd_chunk_get(chunkid,CH_NEW_AUTO); + if (c == NULL) return; if (c->filename!=NULL) { // already have this chunk if (version <= c->version) { // current chunk is older if (todel<2) { // this is R/W fs? 
2012/3/1 Chris Picton <ch...@ec...> > In mfs-1.6.20, the OPENSTEPS vs fd closing logic appears a bit flawed. > > The test is made in hdd_io_end: > > if (OPENSTEPS==0) { > > However, OPENSTEPS is always > 0, as it is initialised once at the top > of the file, with a #define > > > This means the file descriptors never get closed (by that logic). > They are closed in hdd_delayed_ops() with a 5-second delay, so I moved the fsync() into hdd_delayed_ops(), just before close(). An fsync() before close() in hdd_term() is also needed (not in the patch). > Am I reading the code correctly? > > Chris > > On Thu, 2012-03-01 at 16:51 +0800, Davies Liu wrote: > > I cannot figure out how to make the fsync frequency configurable, so I > > moved fsync() to just before close(): > > > > > > --- mfs-1.6.26/mfschunkserver/hddspacemgr.c 2012-02-08 > > 16:15:03.000000000 +0800 > > +++ mfs-1.6.26-r1/mfschunkserver/hddspacemgr.c 2012-03-01 > > 16:17:23.000000000 +0800 > > @@ -1887,28 +1887,28 @@ > > errno = errmem; > > return status; > > } > > - ts = get_usectime(); > > -#ifdef F_FULLFSYNC > > - if (fcntl(c->fd,F_FULLFSYNC)<0) { > > - int errmem = errno; > > - mfs_arg_errlog_silent(LOG_WARNING,"hdd_io_end: > > file:%s - fsync (via fcntl) error",c->filename); > > - errno = errmem; > > - return ERROR_IO; > > - } > > -#else > > - if (fsync(c->fd)<0) { > > - int errmem = errno; > > - mfs_arg_errlog_silent(LOG_WARNING,"hdd_io_end: > > file:%s - fsync (direct call) error",c->filename); > > - errno = errmem; > > - return ERROR_IO; > > - } > > -#endif > > - te = get_usectime(); > > - hdd_stats_datafsync(c->owner,te-ts); > > } > > c->crcrefcount--; > > if (c->crcrefcount==0) { > > if (OPENSTEPS==0) { > > + ts = get_usectime(); > > +#ifdef F_FULLFSYNC > > + if (fcntl(c->fd,F_FULLFSYNC)<0) { > > + int errmem = errno; > > + > > mfs_arg_errlog_silent(LOG_WARNING,"hdd_io_end: file:%s - fsync (via > > fcntl) error",c->filename); > > + errno = errmem; > > + return ERROR_IO; > > + } > > +#else > > + if 
(fsync(c->fd)<0) { > > + int errmem = errno; > > + > > mfs_arg_errlog_silent(LOG_WARNING,"hdd_io_end: file:%s - fsync (direct > > call) error",c->filename); > > + errno = errmem; > > + return ERROR_IO; > > + } > > +#endif > > + te = get_usectime(); > > + hdd_stats_datafsync(c->owner,te-ts); > > if (close(c->fd)<0) { > > int errmem = errno; > > c->fd = -1; > > > > On Thu, Mar 1, 2012 at 12:21 PM, Chris Picton <ch...@ec...> > > wrote: > > I have had similar ideas. > > > > I currently have a patch to disable fsync on every block > > close; however, this will probably lead to data corruption if > > there is a site-wide power outage. > > > > My thoughts are as follows: > > * Create config variable FLUSH_ON_WRITE (0/1) to disable > > or enable flush on write > > * Create config variable FLUSH_DELAY (seconds) to > > prevent flushing immediately - rather, the pointers to > > the written chunks would be stored, and looped through > > (in a separate thread?), to flush any which are older > > than the delay. This would ensure that the chunks > > have a maximum time during which they may potentially > > be invalid on disk. If FLUSH_DELAY is 0, then the > > behaviour is the same as it is currently > > * Create config variable CHECKSUM_INITIAL (0/1). If set > > to 1, it would force a checksum of *all* blocks on a > > chunkserver at startup, to find potentially bad chunks > > before they are used. Is this necessary, though? Are > > checksums read on every block read? > > I may start making the above patches, if I get time to do so. > > > > Chris > > > > > > > > > > On 2012/03/01 6:09 AM, Davies Liu wrote: > > > Hi Michal! > > > > > > > > > I have found the reason for the bad write performance: the chunk > > > server writes the CRC block > > > to disk EVERY second and then calls fsync(), which takes about > > > 28 ms. > > > > > > Can I reduce the frequency of fsync() to once every minute, and then > > > check the chunks modified in the last > > > 30 minutes after booting? 
> > > > > > Davies > > > > > > 2012/2/23 Michał Borychowski <mic...@ge...> > > > Hi Davies! > > > > > > Here is our analysis of this situation. Different > > > files are written simultaneously on the same CS - > > > that's why pwrites are written to different files. > > > A block size of 64kB is not that small. Writes in the OS > > > are sent through the write cache, so all writes whose size is a > > > multiple of 4096 B should work equally fast. > > > > > > Our tests: > > > > > > dd on Linux 64k : 640k > > > $ dd if=/dev/zero of=/tmp/test bs=64k count=10000 > > > 10000+0 records in > > > 10000+0 records out > > > 655360000 bytes (655 MB) copied, 22.1836 s, 29.5 > > > MB/s > > > > > > $ dd if=/dev/zero of=/tmp/test bs=640k count=1000 > > > 1000+0 records in > > > 1000+0 records out > > > 655360000 bytes (655 MB) copied, 23.1311 s, 28.3 > > > MB/s > > > > > > dd on Mac OS X 64k : 640k > > > $ dd if=/dev/zero of=/tmp/test bs=64k count=10000 > > > 10000+0 records in > > > 10000+0 records out > > > 655360000 bytes transferred in 14.874652 secs > > > (44058846 bytes/sec) > > > > > > $ dd if=/dev/zero of=/tmp/test bs=640k count=1000 > > > 1000+0 records in > > > 1000+0 records out > > > 655360000 bytes transferred in 14.578427 secs > > > (44954096 bytes/sec) > > > > > > So the times are similar. Writes going to different > > > files should also not be a problem, as the kernel > > > scheduler takes care of this. > > > > > > If you have a specific idea for how to improve the > > > writes, please share it with us. > > > > > > > > > Kind regards > > > Michał > > > > > > -----Original Message----- > > > From: Davies Liu [mailto:dav...@gm...] > > > Sent: Wednesday, February 22, 2012 8:24 AM > > > To: moo...@li... 
> > > Subject: [Moosefs-users] Bad write performance of > > > mfschunkserver > > > > > > Hi, devs: > > > > > > Today, we found that some mfschunkservers were not > > > responsive, which caused many timeouts in mfsmount, and then > > > all write operations were blocked. 
> > > > > > After some digging, we found that there were some > > > small but continuous write bandwidth, strace show > > > that many small pwrite() between several files: > > > > > > [pid 7087] 12:28:28 pwrite(19, "baggins3 > > > 60.210.18.235 sE7NtNQU7"..., 25995, 55684725 > > > <unfinished ...> [pid 7078] 12:28:28 pwrite(17, > > > "2012/02/22 12:28:28:root: WARNIN"..., 69, 21768909 > > > <unfinished ...> [pid 7080] 12:28:28 pwrite(20, > > > "gardner4 183.7.50.169 mr5vi+Z4H3"..., 47663, > > > 34550257 <unfinished ...> [pid 7079] 12:28:28 > > > pwrite(19, "\" \"Mozilla/5.0 (Windows NT 6.1) "..., > > > 40377, 55710720 <unfinished ...> [pid 7086] > > > 12:28:28 pwrite(23, "MATP; InfoPath.2; .NET4.0C; > > > 360S"..., 65536, 6427648 <unfinished ...> [pid > > > 7082] 12:28:28 pwrite(23, "; GTB7.2; SLCC2; .NET > > > CLR 2.0.50"..., 65536, 6493184 <unfinished ...> > > > [pid 7083] 12:28:28 pwrite(20, "\255BYU\355\237\347 > > > \226s\261\307N{A\355\203S\306\244\255\322[\322\rJ > > > \32[z3\31\311\327"..., > > > 4096, 1024 <unfinished ...> > > > [pid 7078] 12:28:28 pwrite(23, > > > "ovie/subject/4724373/reviews?sta"..., > > > 65536, 6558720 <unfinished ...> > > > [pid 7080] 12:28:28 pwrite(19, > > > "[\"[\4\5\266v\324\366\245n\t\315\202\227\\\343= > > > \336-\r > > > k)\316\354\335\353\373\340\331;"..., 4096, 1024 > > > <unfinished ...> [pid 7079] 12:28:28 pwrite(23, > > > "ta-Python/2.0.15\" > > > 0.016\n211.147."..., 65536, 6624256 <unfinished ...> > > > [pid 7081] 12:28:28 pwrite(23, > > > "4034093?apikey=0eb695f25995d7eb2"..., > > > 65536, 6689792 <unfinished ...> > > > [pid 7084] 12:28:28 pwrite(23, " > > > y8G23n95BKY:43534427:wind8vssc4"..., > > > 65536, 6755328) = 65536 <0.000108> > > > [pid 7078] 12:28:28 pwrite(23, > > > "TkVvKuXfug:3248233:5Yo9vFoOIuo \""..., 65536, > > > 6820864 <unfinished ...> [pid 7086] 12:28:28 > > > pwrite(23, ":s|1563396:s|1040897:s|1395290:s"..., > > > 65536, 6886400 <unfinished ...> > > > [pid 7085] 12:28:28 pwrite(23, "dows%3B%20U%3B% 
> > > 20Windows%20NT%20"..., > > > 65536, 6951936 <unfinished ...> > > > [pid 7087] 12:28:28 pwrite(23, "/533.17.9 (KHTML, > > > like Gecko) Ve"..., 65536, 7017472 <unfinished ...> > > > [pid 7079] 12:28:28 pwrite(23, " r1m+tFW1T5M:: > > > \"22/Feb/2012:00:0"..., 65536, 7083008 > > > <unfinished ...> [pid 7086] 12:28:28 pwrite(19, > > > "baggins5 61.174.60.117 i6MSCBvE1"..., 25159, > > > 55751097 <unfinished ...> [pid 7084] 12:28:28 > > > pwrite(20, "gardner1 182.118.7.64 TjxzPKdqNU"..., > > > 10208, 34597920 <unfinished ...> [pid 7080] > > > 12:28:28 pwrite(23, "d7eb2c23c1d70cc187c1&alt=json > > > HT"..., 65536, 7148544 <unfinished ...> [pid 7083] > > > 12:28:28 pwrite(23, > > > "5_Google&type=n&channel=-3&user_"..., > > > 65536, 7214080 <unfinished ...> > > > [pid 7085] 12:28:28 pwrite(19, "12-02-22 12:28:27 > > > 1861 \"GET /ser"..., 23179, 55776256 > > > <unfinished ...> [pid 7082] 12:28:28 pwrite(23, > > > "\"http://douban.fm/swf/53035/fmpl"..., 65536, > > > 7279616 <unfinished ...> [pid 7078] 12:28:28 > > > pwrite(20, "opic/27639291/add_comment HTTP/1"..., > > > 18576, 34608128 <unfinished ...> [pid 7087] > > > 12:28:28 pwrite(19, "[\"[\4\5\266v\324\366\245n\t > > > \315\202\227\\\343=\336-\r > > > k)\316\354\335\353\373\340\331;"..., 4096, 1024 > > > <unfinished ...> [pid 7079] 12:28:28 pwrite(23, > > > "ww.douban.com%2Fgroup%2Ftopic%2F"..., > > > 65536, 7345152 <unfinished ...> > > > [pid 7081] 12:28:28 pwrite(20, > > > "\255BYU\355\237\347\226s\261\307N{A\355\203S\306 > > > \244\255\322[\322\rJ\32[z3\31\311\327"..., > > > 4096, 1024 <unfinished ...> > > > [pid 7086] 12:28:28 pwrite(23, "patible; MSIE 7.0; > > > Windows NT 6."..., 65536, 7410688 <unfinished ...> > > > [pid 7084] 12:28:28 pwrite(23, "fari/535.7 360EE\" > > > 0.006\n211.147."..., 65536, 7476224 <unfinished ...> > > > [pid 7080] 12:28:28 pwrite(23, "1:OUIVR8CIG5c > > > \"22/Feb/2012:00:03"..., 65536, 7541760 > > > <unfinished ...> [pid 7085] 12:28:28 pwrite(23, "fm > > > \"GET 
/j/mine/playlist?type=s&"..., 65536, 7607296 > > > <unfinished ...> [pid 7083] 12:28:28 pwrite(23, > > > "pe=n&channel=18&user_id=39266798"..., > > > 65536, 7672832 <unfinished ...> > > > [pid 7082] 12:28:28 pwrite(23, " 0.023 > > > \n125.34.190.128 :: > > > \"22/Feb"..., 65536, 7738368 <unfinished ...> [pid > > > 7078] 12:28:28 pwrite(23, "00 5859 > > > \"http://www.douban.com/p"..., 65536, 7803904 > > > <unfinished ...> [pid 7079] 12:28:28 pwrite(23, > > > "03:08 +0800\" www.douban.com \"GET"..., 65536, > > > 7869440 <unfinished ...> [pid 7086] 12:28:28 > > > pwrite(23, "type=all HTTP/1.1\" 200 1492 \"-\" > > > "..., 65536, 7934976 <unfinished ...> > > > [pid 7084] 12:28:28 pwrite(23, > > > "Hiapk&user_id=57982902&expire=13"..., > > > 65536, 8000512 <unfinished ...> > > > [pid 7080] 12:28:28 pwrite(23, "0.011 > > > \n116.253.89.216 rxASuWZf1wg"..., 65536, 8066048 > > > <unfinished ...> [pid 7085] 12:28:28 pwrite(23, "9 > > > +0800\" www.douban.com \"GET /ph"..., 65536, > > > 8131584) = 65536 <0.000062> [pid 7083] 12:28:28 > > > pwrite(23, " +0800\" www.douban.com \"GET /eve"..., > > > 65536, 8197120 <unfinished ...> [pid 7082] 12:28:28 > > > pwrite(23, " +0800\" www.douban.com \"POST /se"..., > > > 65536, 8262656) = 65536 <0.000103> [pid 7087] > > > 12:28:28 pwrite(23, "0 12971 > > > \"http://www.douban.com/g"..., 65536, 8328192 > > > <unfinished ...> [pid 7081] 12:28:28 pwrite(23, ".0 > > > (compatible; MSIE 7.0; Window"..., 65536, 8393728) = > > > 65536 <0.000065> > > > > > > In order to get better performance, the chunk server > > > should merge the continuous sequential write > > > operations into larger ones. > > > > > > -- > > > - Davies > > > > > > > > > > ------------------------------------------------------------------------------ > > > Virtualization & Cloud Management Using Capacity > > > Planning Cloud computing makes use of virtualization > > > - but cloud computing also focuses on allowing > > > computing to be delivered as a service. 
> > > http://www.accelacomm.com/jaw/sfnl/114/51521223/ > > > _______________________________________________ > > > moosefs-users mailing list > > > moo...@li... > > > > https://lists.sourceforge.net/lists/listinfo/moosefs-users > > > > > > > > > > > > > > > > > > -- > > > - Davies > > > > > > > > > > > > > ------------------------------------------------------------------------------ > > > Virtualization & Cloud Management Using Capacity Planning > > > Cloud computing makes use of virtualization - but cloud > computing > > > also focuses on allowing computing to be delivered as a > service. > > > http://www.accelacomm.com/jaw/sfnl/114/51521223/ > > > > > > > > > _______________________________________________ > > > moosefs-users mailing list > > > moo...@li... > > > https://lists.sourceforge.net/lists/listinfo/moosefs-users > > > > > > > > > > > > > > -- > > - Davies > > > > -- > Chris Picton > > Executive Manager - Systems > ECN Telecommunications (Pty) Ltd > t: 010 590 0031 m: 079 721 8521 > f: 087 941 0813 > e: ch...@ec... > > "Lowering the cost of doing business" > > > -- - Davies |