From: Knut P. <Knu...@t-...> - 2005-07-30 09:20:25
|
Hi everybody, could you please verify this change to bitblit.c. Advantage: For drivers with accelerated blitting a significant performance boost. The execution time for my test3 with cyblafb and ypanning drops from 0,870s (2.6.13-rc4 with the patch from Antonino) to 0,772s. If you agree that this is the way to go, we should add some additional code for font resolutions different from 8x16. Should we drop fb_pad_aligned() entirely from the kernel? As far as I see it is only used in cases that do not copy too much data. Replacing fb_pad_aligned at those places with proper inline code should also help to optimize performance. cu, Knut --- linux-2.6.13-rc4/drivers/video/console/bitblit.c 2005-07-29 10:01:01.000000000 +0200 +++ linux/drivers/video/console/bitblit.c 2005-07-30 09:34:49.000000000 +0200 @@ -113,12 +113,15 @@ unsigned int maxcnt = info->pixmap.size/cellsize; unsigned int scan_align = info->pixmap.scan_align - 1; unsigned int buf_align = info->pixmap.buf_align - 1; - unsigned int shift_low = 0, mod = vc->vc_font.width % 8; - unsigned int shift_high = 8, pitch, cnt, size, k; + unsigned int shift_low = 0; + unsigned int mod = vc->vc_font.width % 8; unsigned int idx = vc->vc_font.width >> 3; unsigned int attribute = get_attribute(info, scr_readw(s)); + unsigned int shift_high = 8; + unsigned int pitch, cnt, size, k; + unsigned int fast_8x16, i, j; struct fb_image image; - u8 *src, *dst, *buf = NULL; + u8 *src, *dst, *dstp, *buf = NULL; if (attribute) { buf = kmalloc(cellsize, GFP_KERNEL); @@ -134,6 +137,11 @@ image.height = vc->vc_font.height; image.depth = 1; + if (vc->vc_font.height == 16 && vc->vc_font.width == 8) + fast_8x16 = 1; + else + fast_8x16 = 0; + while (count) { if (count > maxcnt) cnt = k = maxcnt; @@ -147,7 +155,8 @@ size &= ~buf_align; dst = fb_get_buffer_offset(info, &info->pixmap, size); image.data = dst; - if (mod) { + + if (!mod) while (k--) { src = vc->vc_font.data + (scr_readw(s++)& charmask)*cellsize; @@ -157,15 +166,23 @@ src = buf; } - fb_pad_unaligned_buffer(dst, pitch, src, idx, - image.height, shift_high, - shift_low, mod); - shift_low += mod; - dst += (shift_low >= 8) ? width : width - 1; - shift_low &= 7; - shift_high = 8 - shift_low; + // Optimized for speed, memcpy() is too slow! + dstp = dst; + if (fast_8x16) + for (i = 16; i--; ) { + *dstp = src[15-i]; + dstp += pitch; + } + else + for (i = image.height; i--; ) { + for (j = 0; j < idx; j++) + dstp[j] = src[j]; + src += idx; + dstp += pitch; + } + dst += width; } - } else { + else while (k--) { src = vc->vc_font.data + (scr_readw(s++)& charmask)*cellsize; @@ -175,10 +192,15 @@ src = buf; } - fb_pad_aligned_buffer(dst, pitch, src, idx, image.height); - dst += width; + fb_pad_unaligned_buffer(dst, pitch, src, idx, + image.height, shift_high, + shift_low, mod); + shift_low += mod; + dst += (shift_low >= 8) ? width : width - 1; + shift_low &= 7; + shift_high = 8 - shift_low; } - } + info->fbops->fb_imageblit(info, &image); image.dx += cnt * vc->vc_font.width; count -= cnt; |
From: Antonino A. D. <ad...@gm...> - 2005-07-30 13:01:06
|
Knut Petersen wrote: > Hi everybody, > > could you please verify this change to bitblit.c. > > Advantage: For drivers with accelerated blitting a significant > performance boost. > > The execution time for my test3 with cyblafb and ypanning drops from > 0,870s (2.6.13-rc4 with the patch from Antonino) to 0,772s. > > If you agree that this is the way to go, we should add some additional > code for font > resolutions different from 8x16. > > Should we drop fb_pad_aligned() entirely from the kernel? As far as I > see it is only > used in cases that do not copy too much data. Replacing fb_pad_aligned > at those places > with proper inline code should also help to optimize performance. > > > cu, > Knut > > > --- linux-2.6.13-rc4/drivers/video/console/bitblit.c 2005-07-29 > 10:01:01.000000000 +0200 > +++ linux/drivers/video/console/bitblit.c 2005-07-30 > 09:34:49.000000000 +0200 > @@ -113,12 +113,15 @@ > unsigned int maxcnt = info->pixmap.size/cellsize; > unsigned int scan_align = info->pixmap.scan_align - 1; > unsigned int buf_align = info->pixmap.buf_align - 1; > - unsigned int shift_low = 0, mod = vc->vc_font.width % 8; > - unsigned int shift_high = 8, pitch, cnt, size, k; > + unsigned int shift_low = 0; > + unsigned int mod = vc->vc_font.width % 8; > unsigned int idx = vc->vc_font.width >> 3; > unsigned int attribute = get_attribute(info, scr_readw(s)); > + unsigned int shift_high = 8; > + unsigned int pitch, cnt, size, k; > + unsigned int fast_8x16, i, j; > struct fb_image image; > - u8 *src, *dst, *buf = NULL; > + u8 *src, *dst, *dstp, *buf = NULL; > > if (attribute) { > buf = kmalloc(cellsize, GFP_KERNEL); > @@ -134,6 +137,11 @@ > image.height = vc->vc_font.height; > image.depth = 1; > > + if (vc->vc_font.height == 16 && vc->vc_font.width == 8) > + fast_8x16 = 1; > + else > + fast_8x16 = 0; > + > while (count) { > if (count > maxcnt) > cnt = k = maxcnt; > @@ -147,7 +155,8 @@ > size &= ~buf_align; > dst = fb_get_buffer_offset(info, &info->pixmap, size); > image.data = dst; > - if (mod) { > + > + if (!mod) > while (k--) { > src = vc->vc_font.data + (scr_readw(s++)& > charmask)*cellsize; > @@ -157,15 +166,23 @@ > src = buf; > } > > - fb_pad_unaligned_buffer(dst, pitch, src, idx, > - image.height, shift_high, > - shift_low, mod); > - shift_low += mod; > - dst += (shift_low >= 8) ? width : width - 1; > - shift_low &= 7; > - shift_high = 8 - shift_low; > + // Optimized for speed, memcpy() is too slow! > + dstp = dst; > + if (fast_8x16) > + for (i = 16; i--; ) { > + *dstp = src[15-i]; > + dstp += pitch; > + } why not make it to something like this? for (i = cellsize; i--;) { *dstp = src[cellsize-1-i]; dstp += pitch; } This way, it will work with any fontsize, as long as the width is a 8-bit size-aligned. You can do the same with fb_pad_aligned_buffer(), so the rest can benefit too. The above is probably a bit slower than your version, but it's a good compromise -- bit_putcs is ugly enough, the price for the optimization. Tony PS: Next time you submit a patch, add a Signed-off-line. See this document by akpm: http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt |
From: Knut P. <Knu...@t-...> - 2005-07-30 18:42:17
|
Hi Antonino, >> + if (fast_8x16) >> + for (i = 16; i--; ) { >> + *dstp = src[15-i]; >> + dstp += pitch; >> + } > > > why not make it to something like this? > > for (i = cellsize; i--;) { > *dstp = src[cellsize-1-i]; > dstp += pitch; > } > > This way, it will work with any fontsize, as long as the width > is a 8-bit size-aligned. You can do the same with > fb_pad_aligned_buffer(), so the rest can benefit too. > fb_pad_unaligned() could be optimized localy, there are other cases of memcpy(), e.g. in softcursor, etc. Well, this was nothing but a quick ´n dirty hack, I tried to find out if there is still potential for optimizations. If people agree that the gain in speed justifies it, I will try to look at those parts of the kernel _after_ I have submitted my cyberblade/i1 driver. I have to admit that I ´m not familar (enough) with the layers above the hardware driver level right now. > PS: Next time you submit a patch, add a Signed-off-line. ack. btw, should a new driver first be presented in lin...@li... or lkml? cu, Knut |
From: Antonino A. D. <ad...@gm...> - 2005-07-31 00:55:00
|
Knut Petersen wrote: >> PS: Next time you submit a patch, add a Signed-off-line. > > ack. btw, should a new driver first be presented in > lin...@li... or lkml? > Always include linux-fbdev-devel. But you may CC lkml. Tony |
From: James S. <jsi...@in...> - 2005-08-03 17:34:43
|
> why not make it to something like this? > > for (i = cellsize; i--;) { > *dstp = src[cellsize-1-i]; > dstp += pitch; > } > > This way, it will work with any fontsize, as long as the width > is a 8-bit size-aligned. You can do the same with > fb_pad_aligned_buffer(), so the rest can benefit too. > > The above is probably a bit slower than your version, but it's a > good compromise -- bit_putcs is ugly enough, the price for > the optimization. > > Tony > > PS: Next time you submit a patch, add a Signed-off-line. See this > document by akpm: > > http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt If you embedded everything then there is no reason to do massivce copies. Right now this is not the highest priority. We have alot of other fixes that need to go in. |