|
From: Antonino D. <ad...@po...> - 2003-01-14 12:15:20
|
James,
Here's a patch against 2.5.56 and your latest fbdev.diff:
a. fix for cfb_imageblit so it can handle monochrome bitmaps with widths
not a multiple of 8 (12x22, 4x6 fonts should now work)
b. further optimization of fast_imageblit() by removing unnecessary
steps from its main loop.
c. fast_imageblit() should now work for bitmap widths which are at least
divisible by 4 (12x22 and 4x6 fonts should now go to fast_imageblit()
instead of slow_imageblit()).
c. Fix for fast_imageblit() so it always refer to mask tables in 32-bits
which should make it work for 64-bit machines.
d. insert info->fbops->fb_sync() where it is needed: ie,
cfb_{imageblit,fillrect,copyarea} and before the actual read/write in
fb_write and fb_read.
e. trivial: wrap text at 80 columns
Tony
diff -Naur linux-2.5.56-fbdev/drivers/video/cfbcopyarea.c linux/drivers/video/cfbcopyarea.c
--- linux-2.5.56-fbdev/drivers/video/cfbcopyarea.c 2003-01-14 11:34:35.000000000 +0000
+++ linux/drivers/video/cfbcopyarea.c 2003-01-14 01:21:49.000000000 +0000
@@ -65,13 +65,15 @@
// Single word
if (last)
first &= last;
- FB_WRITEL((*src & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL((*src & first) | (FB_READL(dst) & ~first),
+ dst);
} else {
// Multiple destination words
// Leading bits
if (first) {
- FB_WRITEL((*src & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL((*src & first) | (FB_READL(dst) &
+ ~first), dst);
dst++;
src++;
n -= BITS_PER_LONG-dst_idx;
@@ -94,7 +96,8 @@
FB_WRITEL(*src++, dst++);
// Trailing bits
if (last)
- FB_WRITEL((*src & last) | (FB_READL(dst) & ~last), dst);
+ FB_WRITEL((*src & last) | (FB_READL(dst) &
+ ~last), dst);
}
} else {
// Different alignment for source and dest
@@ -108,15 +111,18 @@
first &= last;
if (shift > 0) {
// Single source word
- FB_WRITEL(((*src >> right) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((*src >> right) & first) |
+ (FB_READL(dst) & ~first), dst);
} else if (src_idx+n <= BITS_PER_LONG) {
// Single source word
- FB_WRITEL(((*src << left) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((*src << left) & first) |
+ (FB_READL(dst) & ~first), dst);
} else {
// 2 source words
d0 = *src++;
d1 = *src;
- FB_WRITEL(((d0 << left | d1 >> right) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((d0<<left | d1>>right) & first) |
+ (FB_READL(dst) & ~first), dst);
}
} else {
// Multiple destination words
@@ -124,13 +130,15 @@
// Leading bits
if (shift > 0) {
// Single source word
- FB_WRITEL(((d0 >> right) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((d0 >> right) & first) |
+ (FB_READL(dst) & ~first), dst);
dst++;
n -= BITS_PER_LONG-dst_idx;
} else {
// 2 source words
d1 = *src++;
- FB_WRITEL(((d0 << left | d1 >> right) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((d0<<left | d1>>right) & first) |
+ (FB_READL(dst) & ~first), dst);
d0 = d1;
dst++;
n -= BITS_PER_LONG-dst_idx;
@@ -164,11 +172,15 @@
if (last) {
if (m <= right) {
// Single source word
- FB_WRITEL(((d0 << left) & last) | (FB_READL(dst) & ~last), dst);
+ FB_WRITEL(((d0 << left) & last) |
+ (FB_READL(dst) & ~last),
+ dst);
} else {
// 2 source words
d1 = *src;
- FB_WRITEL(((d0 << left | d1 >> right) & last) | (FB_READL(dst) & ~last), dst);
+ FB_WRITEL(((d0<<left | d1>>right) &
+ last) | (FB_READL(dst) &
+ ~last), dst);
}
}
}
@@ -208,12 +220,14 @@
// Single word
if (last)
first &= last;
- FB_WRITEL((*src & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL((*src & first) | (FB_READL(dst) & ~first),
+ dst);
} else {
// Multiple destination words
// Leading bits
if (first) {
- FB_WRITEL((*src & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL((*src & first) | (FB_READL(dst) &
+ ~first), dst);
dst--;
src--;
n -= dst_idx+1;
@@ -237,7 +251,8 @@
// Trailing bits
if (last)
- FB_WRITEL((*src & last) | (FB_READL(dst) & ~last), dst);
+ FB_WRITEL((*src & last) | (FB_READL(dst) &
+ ~last), dst);
}
} else {
// Different alignment for source and dest
@@ -251,15 +266,18 @@
first &= last;
if (shift < 0) {
// Single source word
- FB_WRITEL((*src << left & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL((*src << left & first) |
+ (FB_READL(dst) & ~first), dst);
} else if (1+(unsigned long)src_idx >= n) {
// Single source word
- FB_WRITEL(((*src >> right) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((*src >> right) & first) |
+ (FB_READL(dst) & ~first), dst);
} else {
// 2 source words
d0 = *src--;
d1 = *src;
- FB_WRITEL(((d0 >> right | d1 << left) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((d0>>right | d1<<left) & first) |
+ (FB_READL(dst) & ~first), dst);
}
} else {
// Multiple destination words
@@ -267,13 +285,15 @@
// Leading bits
if (shift < 0) {
// Single source word
- FB_WRITEL(((d0 << left) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((d0 << left) & first) |
+ (FB_READL(dst) & ~first), dst);
dst--;
n -= dst_idx+1;
} else {
// 2 source words
d1 = *src--;
- FB_WRITEL(((d0 >> right | d1 << left) & first) | (FB_READL(dst) & ~first), dst);
+ FB_WRITEL(((d0>>right | d1<<left) & first) |
+ (FB_READL(dst) & ~first), dst);
d0 = d1;
dst--;
n -= dst_idx+1;
@@ -307,12 +327,15 @@
if (last) {
if (m <= left) {
// Single source word
- FB_WRITEL(((d0 >> right) & last) | (FB_READL(dst) & ~last), dst);
+ FB_WRITEL(((d0 >> right) & last) |
+ (FB_READL(dst) & ~last),
+ dst);
} else {
// 2 source words
d1 = *src;
- FB_WRITEL(((d0 >> right | d1 << left) & last) |
- (FB_READL(dst) & ~last), dst);
+ FB_WRITEL(((d0>>right | d1<<left) &
+ last) | (FB_READL(dst) &
+ ~last), dst);
}
}
}
@@ -364,17 +387,21 @@
(area->sy + area->height) > vyres)
return;
- if (area->dy > area->sy || (area->dy == area->sy && area->dx > area->sx)) {
+ if (area->dy > area->sy || (area->dy == area->sy &&
+ area->dx > area->sx)) {
area->dy += area->height;
area->sy += area->height;
rev_copy = 1;
}
- dst = src = (unsigned long *)((unsigned long)p->screen_base & ~(BYTES_PER_LONG-1));
+ dst = src = (unsigned long *)((unsigned long)p->screen_base &
+ ~(BYTES_PER_LONG-1));
dst_idx = src_idx = (unsigned long)p->screen_base & (BYTES_PER_LONG-1);
dst_idx += area->dy*next_line*8 + area->dx*p->var.bits_per_pixel;
src_idx += area->sy*next_line*8 + area->sx*p->var.bits_per_pixel;
+ if (p->fbops->fb_sync)
+ p->fbops->fb_sync(p);
if (rev_copy) {
while (area->height--) {
dst_idx -= next_line*8;
@@ -383,8 +410,9 @@
dst_idx &= (BYTES_PER_LONG-1);
src += src_idx >> SHIFT_PER_LONG;
src_idx &= (BYTES_PER_LONG-1);
- bitcpy_rev((unsigned long*)dst, dst_idx, (unsigned long *)src,
- src_idx, area->width*p->var.bits_per_pixel);
+ bitcpy_rev((unsigned long*)dst, dst_idx,
+ (unsigned long *)src, src_idx,
+ area->width*p->var.bits_per_pixel);
}
} else {
while (area->height--) {
@@ -392,8 +420,9 @@
dst_idx &= (BYTES_PER_LONG-1);
src += src_idx >> SHIFT_PER_LONG;
src_idx &= (BYTES_PER_LONG-1);
- bitcpy((unsigned long*)dst, dst_idx, (unsigned long *)src,
- src_idx, area->width*p->var.bits_per_pixel);
+ bitcpy((unsigned long*)dst, dst_idx,
+ (unsigned long *)src, src_idx,
+ area->width*p->var.bits_per_pixel);
dst_idx += next_line*8;
src_idx += next_line*8;
}
diff -Naur linux-2.5.56-fbdev/drivers/video/cfbfillrect.c linux/drivers/video/cfbfillrect.c
--- linux-2.5.56-fbdev/drivers/video/cfbfillrect.c 2003-01-14 11:34:32.000000000 +0000
+++ linux/drivers/video/cfbfillrect.c 2003-01-14 01:21:46.000000000 +0000
@@ -99,7 +99,8 @@
* the correct start position
*/
-static inline unsigned long pixel_to_pat(const struct fb_info *p, pixel_t pixel, int left)
+static inline unsigned long pixel_to_pat(const struct fb_info *p,
+ pixel_t pixel, int left)
{
unsigned long pat = pixel;
u32 bpp = p->var.bits_per_pixel;
@@ -373,7 +374,8 @@
vxres = p->var.xres_virtual;
vyres = p->var.yres_virtual;
- if (!rect->width || !rect->height || rect->dx > vxres || rect->dy > vyres)
+ if (!rect->width || !rect->height ||
+ rect->dx > vxres || rect->dy > vyres)
return;
/* We could use hardware clipping but on many cards you get around
@@ -392,14 +394,18 @@
else
fg = rect->color;
- dst = (unsigned long *)((unsigned long)p->screen_base & ~(BYTES_PER_LONG-1));
+ dst = (unsigned long *)((unsigned long)p->screen_base &
+ ~(BYTES_PER_LONG-1));
dst_idx = ((unsigned long)p->screen_base & (BYTES_PER_LONG-1))*8;
dst_idx += rect->dy*p->fix.line_length*8+rect->dx*bpp;
/* FIXME For now we support 1-32 bpp only */
left = BITS_PER_LONG % bpp;
+ if (p->fbops->fb_sync)
+ p->fbops->fb_sync(p);
if (!left) {
u32 pat = pixel_to_pat32(p, fg);
- void (*fill_op32)(unsigned long *dst, int dst_idx, u32 pat, u32 n) = NULL;
+ void (*fill_op32)(unsigned long *dst, int dst_idx, u32 pat,
+ u32 n) = NULL;
switch (rect->rop) {
case ROP_XOR:
@@ -420,8 +426,9 @@
unsigned long pat = pixel_to_pat(p, fg, (left-dst_idx) % bpp);
int right = bpp-left;
int r;
- void (*fill_op)(unsigned long *dst, int dst_idx, unsigned long pat,
- int left, int right, u32 n) = NULL;
+ void (*fill_op)(unsigned long *dst, int dst_idx,
+ unsigned long pat, int left, int right,
+ u32 n) = NULL;
switch (rect->rop) {
case ROP_XOR:
@@ -435,7 +442,8 @@
while (height--) {
dst += dst_idx >> SHIFT_PER_LONG;
dst_idx &= (BITS_PER_LONG-1);
- fill_op(dst, dst_idx, pat, left, right, rect->width*bpp);
+ fill_op(dst, dst_idx, pat, left, right,
+ rect->width*bpp);
r = (p->fix.line_length*8) % bpp;
pat = pat << (bpp-r) | pat >> r;
dst_idx += p->fix.line_length*8;
diff -Naur linux-2.5.56-fbdev/drivers/video/cfbimgblt.c linux/drivers/video/cfbimgblt.c
--- linux-2.5.56-fbdev/drivers/video/cfbimgblt.c 2003-01-14 11:34:27.000000000 +0000
+++ linux/drivers/video/cfbimgblt.c 2003-01-14 01:21:42.000000000 +0000
@@ -19,10 +19,6 @@
* up to the nearest byte. For example a bitmap 12 bits wide must be two
* bytes width.
*
- * FIXME
- * The code for 24 bit is horrible. It copies byte by byte size instead of
- * longs like the other sizes. Needs to be optimized.
- *
* Tony:
* Incorporate mask tables similar to fbcon-cfb*.c in 2.4 API. This speeds
* up the code significantly.
@@ -32,7 +28,6 @@
*
* Also need to add code to deal with cards endians that are different than
* the native cpu endians. I also need to deal with MSB position in the word.
- *
*/
#include <linux/config.h>
#include <linux/module.h>
@@ -88,18 +83,21 @@
#if defined (__BIG_ENDIAN)
#define LEFT_POS(bpp) (BITS_PER_LONG - bpp)
+#define LEFT_POS32(bpp) (32 - bpp)
#define NEXT_POS(pos, bpp) ((pos) -= (bpp))
#define SHIFT_HIGH(val, bits) ((val) >> (bits))
#define SHIFT_LOW(val, bits) ((val) << (bits))
#else
#define LEFT_POS(bpp) (0)
+#define LEFT_POS32(bpp) (0)
#define NEXT_POS(pos, bpp) ((pos) += (bpp))
#define SHIFT_HIGH(val, bits) ((val) << (bits))
#define SHIFT_LOW(val, bits) ((val) >> (bits))
#endif
-static inline void color_imageblit(struct fb_image *image, struct fb_info *p, u8 *dst1,
- unsigned long start_index, unsigned long pitch_index)
+static inline void color_imageblit(struct fb_image *image, struct fb_info *p,
+ u8 *dst1, unsigned long start_index,
+ unsigned long pitch_index)
{
/* Draw the penguin */
unsigned long *dst, *dst2, color = 0, val, shift;
@@ -116,7 +114,8 @@
val = 0;
if (start_index) {
- unsigned long start_mask = ~(SHIFT_HIGH(~0UL, start_index));
+ unsigned long start_mask = ~(SHIFT_HIGH(~0UL,
+ start_index));
val = FB_READL(dst) & start_mask;
shift = start_index;
@@ -134,7 +133,8 @@
if (shift == null_bits)
val = 0;
else
- val = SHIFT_LOW(color, BITS_PER_LONG - shift);
+ val = SHIFT_LOW(color, BITS_PER_LONG -
+ shift);
}
shift += bpp;
shift &= (BITS_PER_LONG - 1);
@@ -157,60 +157,64 @@
}
}
-static inline void slow_imageblit(struct fb_image *image, struct fb_info *p, u8 *dst1,
- unsigned long fgcolor, unsigned long bgcolor,
- unsigned long start_index, unsigned long pitch_index)
+static inline void slow_imageblit(struct fb_image *image, struct fb_info *p,
+ u8 *dst1, unsigned long fgcolor,
+ unsigned long bgcolor,
+ unsigned long start_index,
+ unsigned long pitch_index)
{
- unsigned long i, j, l = 8;
+ unsigned long i, j, l;
unsigned long shift, color, bpp = p->var.bits_per_pixel;
unsigned long *dst, *dst2, val, pitch = p->fix.line_length;
unsigned long null_bits = BITS_PER_LONG - bpp;
+ unsigned long spitch = (image->width+7)/8;
u8 *src = image->data, *s;
dst2 = (unsigned long *) dst1;
for (i = image->height; i--; ) {
- shift = 0;
- val = 0;
+ shift = val = 0;
+ l = 8;
j = image->width;
dst = (unsigned long *) dst1;
+ s = src;
/* write leading bits */
if (start_index) {
- unsigned long start_mask = ~(SHIFT_HIGH(~0UL, start_index));
+ unsigned long start_mask = ~(SHIFT_HIGH(~0UL,
+ start_index));
val = FB_READL(dst) & start_mask;
shift = start_index;
}
+
while (j--) {
l--;
- if (*src & (1 << l))
- color = fgcolor;
- else
- color = bgcolor;
+ color = (*s & (1 << l)) ? fgcolor : bgcolor;
color <<= LEFT_POS(bpp);
val |= SHIFT_HIGH(color, shift);
/* Did the bitshift spill bits to the next long? */
if (shift >= null_bits) {
FB_WRITEL(val, dst++);
- if (shift == null_bits)
- val = 0;
- else
- val = SHIFT_LOW(color, BITS_PER_LONG - shift);
+ val = (shift == null_bits) ?
+ 0 : SHIFT_LOW(color, BITS_PER_LONG -
+ shift);
}
shift += bpp;
shift &= (BITS_PER_LONG - 1);
- if (!l) { l = 8; src++; };
+ if (!l) { l = 8; s++; };
}
+
/* write trailing bits */
if (shift) {
unsigned long end_mask = SHIFT_HIGH(~0UL, shift);
FB_WRITEL((FB_READL(dst) & end_mask) | val, dst);
}
- dst1 += pitch;
+ dst1 += pitch;
+ src += spitch;
if (pitch_index) {
dst2 += pitch;
dst1 = (char *) dst2;
@@ -223,26 +227,33 @@
}
}
-static inline void fast_imageblit(struct fb_image *image, struct fb_info *p, u8 *dst1,
- unsigned long fgcolor, unsigned long bgcolor)
+/*
+ * fast_imageblit - optimized monochrome color expansion
+ *
+ * Only if: bits_per_pixel == 8, 16, or 32
+ * image->width is divisible by pixel/dword (ppw);
+ * fix->next_line is divisible by 4;
+ * beginning and end of a scanline is dword aligned
+ */
+static inline void fast_imageblit(struct fb_image *image, struct fb_info *p,
+ u8 *dst1, u32 fgcolor, u32 bgcolor)
{
- int i, j, k, l = 8, n;
- unsigned long bit_mask, end_mask, eorx;
- unsigned long fgx = fgcolor, bgx = bgcolor, pad, bpp = p->var.bits_per_pixel;
- unsigned long tmp = (1 << bpp) - 1;
- unsigned long ppw = BITS_PER_LONG/bpp, ppos;
- unsigned long *dst;
+ int i, j, k;
+ u32 bit_mask, end_mask, eorx, shift;
+ u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
+ u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
+ u32 *dst;
u32 *tab = NULL;
- char *src = image->data;
+ char *s = image->data, *src;
- switch (ppw) {
- case 4:
+ switch (bpp) {
+ case 8:
tab = cfb_tab8;
break;
- case 2:
+ case 16:
tab = cfb_tab16;
break;
- case 1:
+ case 32:
tab = cfb_tab32;
break;
}
@@ -254,38 +265,20 @@
bgx |= bgcolor;
}
- n = ((image->width + 7) / 8);
- pad = (n * 8) - image->width;
- n = image->width % ppw;
-
bit_mask = (1 << ppw) - 1;
eorx = fgx ^ bgx;
-
k = image->width/ppw;
for (i = image->height; i--; ) {
- dst = (unsigned long *) dst1;
-
+ dst = (u32 *) dst1; shift = 8; src = s;
for (j = k; j--; ) {
- l -= ppw;
- end_mask = tab[(*src >> l) & bit_mask];
- FB_WRITEL((end_mask & eorx)^bgx, dst++);
- if (!l) { l = 8; src++; }
+ shift -= ppw;
+ end_mask = tab[(*src >> shift) & bit_mask];
+ fb_writel((end_mask & eorx)^bgx, dst++);
+ if (!shift) { shift = 8; src++; }
}
- if (n) {
- end_mask = 0;
- ppos = LEFT_POS(bpp);
- for (j = n; j > 0; j--) {
- l--;
- if (*src & (1 << l))
- end_mask |= tmp << ppos;
- NEXT_POS(ppos, bpp);
- if (!l) { l = 8; src++; }
- }
- FB_WRITEL((end_mask & eorx)^bgx, dst++);
- }
- l -= pad;
- dst1 += p->fix.line_length;
+ dst1 += p->fix.line_length;
+ s += spitch;
}
}
@@ -299,8 +292,9 @@
vxres = p->var.xres_virtual;
vyres = p->var.yres_virtual;
/*
- * We could use hardware clipping but on many cards you get around hardware
- * clipping by writing to framebuffer directly like we are doing here.
+ * We could use hardware clipping but on many cards you get around
+ * hardware clipping by writing to framebuffer directly like we are
+ * doing here.
*/
if (image->dx > vxres ||
image->dy > vyres)
@@ -323,21 +317,25 @@
bitstart &= ~(bpl - 1);
dst1 = p->screen_base + bitstart;
+ if (p->fbops->fb_sync)
+ p->fbops->fb_sync(p);
if (image->depth == 1) {
if (p->fix.visual == FB_VISUAL_TRUECOLOR ||
p->fix.visual == FB_VISUAL_DIRECTCOLOR) {
- fgcolor = ((u32 *)(p->pseudo_palette))[image->fg_color];
- bgcolor = ((u32 *)(p->pseudo_palette))[image->bg_color];
+ fgcolor = ((u32*)(p->pseudo_palette))[image->fg_color];
+ bgcolor = ((u32*)(p->pseudo_palette))[image->bg_color];
} else {
fgcolor = image->fg_color;
bgcolor = image->bg_color;
}
- if (BITS_PER_LONG % bpp == 0 && !start_index && !pitch_index &&
- bpp >= 8 && bpp <= 32 && (image->width & 7) == 0)
+ if (BITS_PER_LONG % bpp == 0 && !start_index &&
+ !pitch_index && bpp >= 8 && bpp <= 32 &&
+ (image->width & (32/bpp-1)) == 0)
fast_imageblit(image, p, dst1, fgcolor, bgcolor);
else
- slow_imageblit(image, p, dst1, fgcolor, bgcolor, start_index, pitch_index);
+ slow_imageblit(image, p, dst1, fgcolor, bgcolor,
+ start_index, pitch_index);
}
else if (image->depth == bpp)
color_imageblit(image, p, dst1, start_index, pitch_index);
diff -Naur linux-2.5.56-fbdev/drivers/video/fbmem.c linux/drivers/video/fbmem.c
--- linux-2.5.56-fbdev/drivers/video/fbmem.c 2003-01-14 11:34:40.000000000 +0000
+++ linux/drivers/video/fbmem.c 2003-01-14 01:21:53.000000000 +0000
@@ -656,6 +656,8 @@
count = info->fix.smem_len;
if (count + p > info->fix.smem_len)
count = info->fix.smem_len - p;
+ if (info->fbops->fb_sync)
+ info->fbops->fb_sync(info);
if (count) {
char *base_addr;
@@ -692,6 +694,8 @@
count = info->fix.smem_len - p;
err = -ENOSPC;
}
+ if (info->fbops->fb_sync)
+ info->fbops->fb_sync(info);
if (count) {
char *base_addr;
|
|
From: James S. <jsi...@in...> - 2003-01-15 00:27:51
|
Applied.
> c. Fix for fast_imageblit() so it always refer to mask tables in 32-bits
> which should make it work for 64-bit machines.
Ug. I'd rather try to take advantage of using the full 64 bits of data to
pass across the bus. What I was thinking is to treat the 64-bit case as two
32-bit cases. The 64-bit data comes in and we run the data twice at tabs[]. |
|
From: Antonino D. <ad...@po...> - 2003-01-15 02:19:53
|
On Wed, 2003-01-15 at 08:26, James Simmons wrote:
>
> Applied.
>
> > c. Fix for fast_imageblit() so it always refer to mask tables in 32-bits
> > which should make it work for 64-bit machines.
>
> Ug. I rather try yo take advantge of using the full 64 bits of data to
> pass across the bus. What I was think is treat the 64 bit case as two 32
> bit cases. The 64 bit data comes in and we run the data twice at tabs[].
>
Hi James,
Yes, I was trying to find a way to make fast_imageblit() be fast for all
machine architectures. With the patch attached, there's
fast_imageblit32() and fast_imageblit64(). fast_imageblit32() is
probably slower than fast_imageblit64 on 64-bit machines and, on the
other hand, fast_imageblit64() is 20% slower on 32-bit machines, but is
probably faster on 64-bit and higher machines. So, the only way I can
think of doing this on all machine architectures is to have them go
separate paths.
Note: both fast_imageblit32() and fast_imageblit64(), in theory, should
work with all machine archs. Your call.
Tony
PS: the diff should be applied with the previous patch I submitted.
diff -Naur linux-2.5.56-fbdev/drivers/video/cfbimgblt.c linux/drivers/video/cfbimgblt.c
--- linux-2.5.56-fbdev/drivers/video/cfbimgblt.c 2003-01-15 01:56:47.000000000 +0000
+++ linux/drivers/video/cfbimgblt.c 2003-01-15 01:57:01.000000000 +0000
@@ -74,11 +74,13 @@
};
#if BITS_PER_LONG == 32
-#define FB_WRITEL fb_writel
-#define FB_READL fb_readl
+#define FB_WRITEL fb_writel
+#define FB_READL fb_readl
+#define FAST_IMAGEBLIT fast_imageblit32
#else
-#define FB_WRITEL fb_writeq
-#define FB_READL fb_readq
+#define FB_WRITEL fb_writeq
+#define FB_READL fb_readq
+#define FAST_IMAGEBLIT fast_imageblit64
#endif
#if defined (__BIG_ENDIAN)
@@ -235,15 +237,16 @@
* fix->next_line is divisible by 4;
* beginning and end of a scanline is dword aligned
*/
-static inline void fast_imageblit(struct fb_image *image, struct fb_info *p,
- u8 *dst1, u32 fgcolor, u32 bgcolor)
+#if BITS_PER_LONG == 32
+static inline void fast_imageblit32(struct fb_image *image, struct fb_info *p,
+ u8 *dst1, u32 fgcolor, u32 bgcolor)
{
int i, j, k;
u32 bit_mask, end_mask, eorx, shift;
u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
- u32 *dst;
u32 *tab = NULL;
+ u32 *dst;
char *s = image->data, *src;
switch (bpp) {
@@ -281,7 +284,61 @@
s += spitch;
}
}
+#else
+static inline void fast_imageblit64(struct fb_image *image, struct fb_info *p,
+ u8 *dst1, u32 fgcolor, u32 bgcolor)
+{
+ int i, j, k;
+ u32 bit_mask, end_mask, eorx, shift;
+ u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
+ u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
+ u32 *tab = NULL, bpl;
+ unsigned long *dst, val;
+ char *s = image->data, *src;
+
+ switch (bpp) {
+ case 8:
+ tab = cfb_tab8;
+ break;
+ case 16:
+ tab = cfb_tab16;
+ break;
+ case 32:
+ tab = cfb_tab32;
+ break;
+ }
+
+ for (i = ppw-1; i--; ) {
+ fgx <<= bpp;
+ bgx <<= bpp;
+ fgx |= fgcolor;
+ bgx |= bgcolor;
+ }
+ bit_mask = (1 << ppw) - 1;
+ eorx = fgx ^ bgx;
+ k = image->width/ppw;
+
+ for (i = image->height; i--; ) {
+ dst = (unsigned long *) dst1; shift = 8; src = s;
+ val = 0, bpl = 0;
+ for (j = k; j--; ) {
+ shift -= ppw;
+ end_mask = tab[(*src >> shift) & bit_mask];
+ val |= SHIFT_HIGH((end_mask & eorx)^bgx, bpl);
+ bpl += 32;
+ bpl &= BITS_PER_LONG - 1;
+ if (!bpl) {
+ FB_WRITEL(val, dst++);
+ val = 0;
+ }
+ if (!shift) { shift = 8; src++; }
+ }
+ dst1 += p->fix.line_length;
+ s += spitch;
+ }
+}
+#endif
void cfb_imageblit(struct fb_info *p, struct fb_image *image)
{
int x2, y2, vxres, vyres;
@@ -331,8 +388,8 @@
if (BITS_PER_LONG % bpp == 0 && !start_index &&
!pitch_index && bpp >= 8 && bpp <= 32 &&
- (image->width & (32/bpp-1)) == 0)
- fast_imageblit(image, p, dst1, fgcolor, bgcolor);
+ (image->width & (BITS_PER_LONG/bpp-1)) == 0)
+ FAST_IMAGEBLIT(image, p, dst1, fgcolor, bgcolor);
else
slow_imageblit(image, p, dst1, fgcolor, bgcolor,
start_index, pitch_index);
|
|
From: Geert U. <ge...@li...> - 2003-01-15 09:30:27
|
On 15 Jan 2003, Antonino Daplas wrote:
> On Wed, 2003-01-15 at 08:26, James Simmons wrote:
> > Applied.
> >
> > > c. Fix for fast_imageblit() so it always refer to mask tables in 32-bits
> > > which should make it work for 64-bit machines.
> >
> > Ug. I rather try yo take advantge of using the full 64 bits of data to
> > pass across the bus. What I was think is treat the 64 bit case as two 32
> > bit cases. The 64 bit data comes in and we run the data twice at tabs[].
> >
> Hi James,
>
> Yes, I was trying to find a way to make fast_imageblit() be fast for all
> machine architectures. With the patch attached, there's
> fast_imageblit32() and fast_imageblit64(). fast_imageblit32() is
> probably slower than fast_imageblit64 on 64-bit machines and, on the
> other hand, fast_imageblit64() is 20% slower on 32-bit machines, but is
> probably faster on 64-bit and higher machines. So, the only way I can
> think of doing this on all machine architectures is to have them go
> separate paths.
Can't you merge fast_imageblit32() and fast_imageblit64() a bit more (with some
#ifdef's), and just call the result fast_imageblit()? Then the definition of
FAST_IMAGEBLIT can go away.
u32 is the same as unsigned long if BITS_PER_LONG == 32.
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@li...
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
|
|
From: Antonino D. <ad...@po...> - 2003-01-15 11:57:16
|
On Wed, 2003-01-15 at 17:28, Geert Uytterhoeven wrote:
> On 15 Jan 2003, Antonino Daplas wrote:
> > On Wed, 2003-01-15 at 08:26, James Simmons wrote:
> > > Applied.
> > >
> > > > c. Fix for fast_imageblit() so it always refer to mask tables in 32-bits
> > > > which should make it work for 64-bit machines.
> > >
> > > Ug. I rather try yo take advantge of using the full 64 bits of data to
> > > pass across the bus. What I was think is treat the 64 bit case as two 32
> > > bit cases. The 64 bit data comes in and we run the data twice at tabs[].
> > >
> > Hi James,
> >
> > Yes, I was trying to find a way to make fast_imageblit() be fast for all
> > machine architectures. With the patch attached, there's
> > fast_imageblit32() and fast_imageblit64(). fast_imageblit32() is
> > probably slower than fast_imageblit64 on 64-bit machines and, on the
> > other hand, fast_imageblit64() is 20% slower on 32-bit machines, but is
> > probably faster on 64-bit and higher machines. So, the only way I can
> > think of doing this on all machine architectures is to have them go
> > separate paths.
>
> Can't you merge fast_imageblit32() and fast_imageblit64() a bit more (with some
> #ifdef's), and just call the result fast_imageblit()? Then the definition of
> FAST_IMAGEBLIT can go away.
>
> u32 is the same as unsigned long if BITS_PER_LONG == 32.
>
That's true. I don't want to do the merge before you people have seen
it. Anyway, here's an updated one.
Tony
diff -Naur linux-2.5.56-fbdev/drivers/video/cfbimgblt.c linux/drivers/video/cfbimgblt.c
--- linux-2.5.56-fbdev/drivers/video/cfbimgblt.c 2003-01-15 01:56:47.000000000 +0000
+++ linux/drivers/video/cfbimgblt.c 2003-01-15 11:43:53.000000000 +0000
@@ -73,14 +73,6 @@
0x00000000, 0xffffffff
};
-#if BITS_PER_LONG == 32
-#define FB_WRITEL fb_writel
-#define FB_READL fb_readl
-#else
-#define FB_WRITEL fb_writeq
-#define FB_READL fb_readq
-#endif
-
#if defined (__BIG_ENDIAN)
#define LEFT_POS(bpp) (BITS_PER_LONG - bpp)
#define LEFT_POS32(bpp) (32 - bpp)
@@ -95,6 +87,28 @@
#define SHIFT_LOW(val, bits) ((val) >> (bits))
#endif
+#if BITS_PER_LONG == 32
+#define FB_WRITEL fb_writel
+#define FB_READL fb_readl
+#define DECLARE_FASTPATH {}
+#define INIT_FASTPATH {}
+#define FASTPATH fb_writel((end_mask & eorx)^bgx, dst++)
+#else
+#define FB_WRITEL fb_writeq
+#define FB_READL fb_readq
+#define DECLARE_FASTPATH unsigned long val, bpl
+#define INIT_FASTPATH { val = 0; bpl = 0; }
+#define FASTPATH { \
+ val |= SHIFT_HIGH((end_mask & eorx)^bgx, bpl); \
+ bpl += 32; \
+ bpl &= BITS_PER_LONG - 1; \
+ if (!bpl) { \
+ FB_WRITEL(val, dst++); \
+ val = 0; \
+ } \
+}
+#endif
+
static inline void color_imageblit(struct fb_image *image, struct fb_info *p,
u8 *dst1, unsigned long start_index,
unsigned long pitch_index)
@@ -242,10 +256,11 @@
u32 bit_mask, end_mask, eorx, shift;
u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
- u32 *dst;
u32 *tab = NULL;
+ unsigned long *dst;
char *s = image->data, *src;
-
+ DECLARE_FASTPATH;
+
switch (bpp) {
case 8:
tab = cfb_tab8;
@@ -270,18 +285,19 @@
k = image->width/ppw;
for (i = image->height; i--; ) {
- dst = (u32 *) dst1; shift = 8; src = s;
+ dst = (unsigned long *) dst1; shift = 8; src = s;
+ INIT_FASTPATH;
for (j = k; j--; ) {
shift -= ppw;
end_mask = tab[(*src >> shift) & bit_mask];
- fb_writel((end_mask & eorx)^bgx, dst++);
+ FASTPATH;
if (!shift) { shift = 8; src++; }
}
dst1 += p->fix.line_length;
s += spitch;
}
}
-
+
void cfb_imageblit(struct fb_info *p, struct fb_image *image)
{
int x2, y2, vxres, vyres;
@@ -331,7 +347,7 @@
if (BITS_PER_LONG % bpp == 0 && !start_index &&
!pitch_index && bpp >= 8 && bpp <= 32 &&
- (image->width & (32/bpp-1)) == 0)
+ (image->width & (BITS_PER_LONG/bpp-1)) == 0)
fast_imageblit(image, p, dst1, fgcolor, bgcolor);
else
slow_imageblit(image, p, dst1, fgcolor, bgcolor,
|