From d8834a1323af72f6145bc81adadd75185ef6065f Mon Sep 17 00:00:00 2001 From: Matthias Weisser Date: Thu, 10 Mar 2011 21:36:32 +0000 Subject: arm: Use optimized memcpy and memset from linux Using optimized versions of memset and memcpy from linux brings a quite noticeable speed (x2 or better) improvement for these two functions. Here are some numbers for test done with jadecpu | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)| | | +patch | | +patch | ---------------------------+--------+--------+--------+--------+ Reset to prompt | 438ms | 330ms | 228ms | 120ms | | | | | | TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms | | | | | | FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ | | | | | | BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms | where CRC is | 615ms | 615ms | 54ms | 54ms | uncompress | 2460ms | 2462ms | 450ms | 451ms | final boot_elf | 376ms | 68ms | 65ms | 65ms | | | | | | BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms | where CRC is | 600ms | 600ms | 135ms | 135ms | uncompress | 2209ms | 2211ms | 828ms | 828ms | | | | | | Copy 1.4MB from NOR to RAM | 134ms | 72ms | 120ms | 70ms | (1) No dcache (2) dcache enabled in board_init *Does not work when dcache is on Size impact: C version: text data bss dec hex filename 202862 18912 266456 488230 77326 u-boot ASM version: text data bss dec hex filename 203798 18912 266288 488998 77626 u-boot 222712 u-boot.bin Signed-off-by: Matthias Weisser --- arch/arm/lib/Makefile | 2 + arch/arm/lib/memcpy.S | 241 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/arm/lib/memset.S | 126 ++++++++++++++++++++++++++ 3 files changed, 369 insertions(+) create mode 100644 arch/arm/lib/memcpy.S create mode 100644 arch/arm/lib/memset.S (limited to 'arch/arm/lib') diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 454440c..03b1b5e 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -44,6 +44,8 @@ COBJS-y += cache-cp15.o endif COBJS-y += interrupts.o COBJS-y += reset.o +SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o +SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o SRCS := $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \ $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c) diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S new file mode 100644 index 0000000..40db90e --- /dev/null +++ b/arch/arm/lib/memcpy.S @@ -0,0 +1,241 @@ +/* + * linux/arch/arm/lib/memcpy.S + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: MontaVista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +#define W(instr) instr + +#define LDR1W_SHIFT 0 +#define STR1W_SHIFT 0 + + .macro ldr1w ptr reg abort + W(ldr) \reg, [\ptr], #4 + .endm + + .macro ldr4w ptr reg1 reg2 reg3 reg4 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} + .endm + + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro ldr1b ptr reg cond=al abort + ldr\cond\()b \reg, [\ptr], #1 + .endm + + .macro str1w ptr reg abort + W(str) \reg, [\ptr], #4 + .endm + + .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro str1b ptr reg cond=al abort + str\cond\()b \reg, [\ptr], #1 + .endm + + .macro enter reg1 reg2 + stmdb sp!, {r0, \reg1, \reg2} + .endm + + .macro exit reg1 reg2 + ldmfd sp!, {r0, \reg1, \reg2} + .endm + + .text + +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ + +.globl memcpy +memcpy: + + enter r4, lr + + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #0] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb r3, ip, #32 ) + CALGN( sbcnes r4, r3, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, r3 ) @ C gets set + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #0] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 4f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +3: PLD( pld [r1, #124] ) +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + subs r2, r2, #32 + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 +#if LDR1W_SHIFT > 0 + lsl ip, ip, #LDR1W_SHIFT +#endif + addne pc, pc, ip @ C is always clear here + b 7f +6: + .rept (1 << LDR1W_SHIFT) + W(nop) + .endr + ldr1w r1, r3, abort=20f + ldr1w r1, r4, abort=20f + ldr1w r1, r5, abort=20f + ldr1w r1, r6, abort=20f + ldr1w r1, r7, abort=20f + ldr1w r1, r8, abort=20f + ldr1w r1, lr, abort=20f + +#if LDR1W_SHIFT < STR1W_SHIFT + lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT +#elif LDR1W_SHIFT > STR1W_SHIFT + lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT +#endif + add pc, pc, ip + nop + .rept (1 << STR1W_SHIFT) + W(nop) + .endr + str1w r0, r3, abort=20f + str1w r0, r4, abort=20f + str1w r0, r5, abort=20f + str1w r0, r6, abort=20f + str1w r0, r7, abort=20f + str1w r0, r8, abort=20f + str1w r0, lr, abort=20f + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldr1b r1, r3, ne, abort=21f + ldr1b r1, r4, cs, abort=21f + ldr1b r1, ip, cs, abort=21f + str1b r0, r3, ne, abort=21f + str1b r0, r4, cs, abort=21f + str1b r0, ip, cs, abort=21f + + exit r4, pc + +9: rsb ip, ip, #4 + cmp ip, #2 + ldr1b r1, r3, gt, abort=21f + ldr1b r1, r4, ge, abort=21f + ldr1b r1, lr, abort=21f + str1b r0, r3, gt, abort=21f + str1b r0, r4, ge, abort=21f + subs r2, r2, ip + str1b r0, lr, abort=21f + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr1w r1, lr, abort=21f + beq 17f + bgt 18f + + + .macro forward_copy_shift pull push + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb ip, ip, #32 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #0] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 13f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +12: PLD( pld [r1, #124] ) +13: ldr4w r1, r4, r5, r6, r7, abort=19f + mov r3, lr, pull #\pull + subs r2, r2, #32 + ldr4w r1, r8, r9, ip, lr, abort=19f + orr r3, r3, r4, push #\push + mov r4, r4, pull #\pull + orr r4, r4, r5, push #\push + mov r5, r5, pull #\pull + orr r5, r5, r6, push #\push + mov r6, r6, pull #\pull + orr r6, r6, r7, push #\push + mov r7, r7, pull #\pull + orr r7, r7, r8, push #\push + mov r8, r8, pull #\pull + orr r8, r8, r9, push #\push + mov r9, r9, pull #\pull + orr r9, r9, ip, push #\push + mov ip, ip, pull #\pull + orr ip, ip, lr, push #\push + str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov r3, lr, pull #\pull + ldr1w r1, lr, abort=21f + subs ip, ip, #4 + orr r3, r3, lr, push #\push + str1w r0, r3, abort=21f + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: sub r1, r1, #(\push / 8) + b 8b + + .endm + + + forward_copy_shift pull=8 push=24 + +17: forward_copy_shift pull=16 push=16 + +18: forward_copy_shift pull=24 push=8 + diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S new file mode 100644 index 0000000..0cdf895 --- /dev/null +++ b/arch/arm/lib/memset.S @@ -0,0 +1,126 @@ +/* + * linux/arch/arm/lib/memset.S + * + * Copyright (C) 1995-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * ASM optimised string functions + */ +#include + + .text + .align 5 + .word 0 + +1: subs r2, r2, #4 @ 1 do we have enough + blt 5f @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [r0], #1 @ 1 + strleb r1, [r0], #1 @ 1 + strb r1, [r0], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) +/* + * The pointer is now aligned and the length is adjusted. Try doing the + * memset again. + */ + +.globl memset +memset: + ands r3, r0, #3 @ 1 unaligned? + bne 1b @ 1 +/* + * we know that the pointer in r0 is aligned to a word boundary. + */ + orr r1, r1, r1, lsl #8 + orr r1, r1, r1, lsl #16 + mov r3, r1 + cmp r2, #16 + blt 4f + +#if ! CALGN(1)+0 + +/* + * We need an extra register for this loop - save the return address and + * use the LR + */ + str lr, [sp, #-4]! + mov ip, r1 + mov lr, r1 + +2: subs r2, r2, #64 + stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. + stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + bgt 2b + ldmeqfd sp!, {pc} @ Now <64 bytes to go. +/* + * No need to correct the count; we're only testing bits from now on + */ + tst r2, #32 + stmneia r0!, {r1, r3, ip, lr} + stmneia r0!, {r1, r3, ip, lr} + tst r2, #16 + stmneia r0!, {r1, r3, ip, lr} + ldr lr, [sp], #4 + +#else + +/* + * This version aligns the destination pointer in order to write + * whole cache lines at once. + */ + + stmfd sp!, {r4-r7, lr} + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r7, r1 + mov ip, r1 + mov lr, r1 + + cmp r2, #96 + tstgt r0, #31 + ble 3f + + and ip, r0, #31 + rsb ip, ip, #32 + sub r2, r2, ip + movs ip, ip, lsl #(32 - 4) + stmcsia r0!, {r4, r5, r6, r7} + stmmiia r0!, {r4, r5} + tst ip, #(1 << 30) + mov ip, r1 + strne r1, [r0], #4 + +3: subs r2, r2, #64 + stmgeia r0!, {r1, r3-r7, ip, lr} + stmgeia r0!, {r1, r3-r7, ip, lr} + bgt 3b + ldmeqfd sp!, {r4-r7, pc} + + tst r2, #32 + stmneia r0!, {r1, r3-r7, ip, lr} + tst r2, #16 + stmneia r0!, {r4-r7} + ldmfd sp!, {r4-r7, lr} + +#endif + +4: tst r2, #8 + stmneia r0!, {r1, r3} + tst r2, #4 + strne r1, [r0], #4 +/* + * When we get here, we've got less than 4 bytes to zero. We + * may have an unaligned pointer as well. + */ +5: tst r2, #2 + strneb r1, [r0], #1 + strneb r1, [r0], #1 + tst r2, #1 + strneb r1, [r0], #1 + mov pc, lr -- cgit v1.1 From 6445a3051efadcac1264998cb6387cd1aec2e935 Mon Sep 17 00:00:00 2001 From: Eric Cooper Date: Thu, 14 Apr 2011 12:32:37 +0000 Subject: ARM: fix stack pointer adjustment in board_init_f() Since addr_sp is a byte address, it should be adjusted by 12 here. Signed-off-by: Eric Cooper Cc: Albert ARIBAUD Acked-by: Wolfgang Denk --- arch/arm/lib/board.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm/lib') diff --git a/arch/arm/lib/board.c b/arch/arm/lib/board.c index dc46e21..d5b34ad 100644 --- a/arch/arm/lib/board.c +++ b/arch/arm/lib/board.c @@ -399,7 +399,7 @@ void board_init_f (ulong bootflag) CONFIG_STACKSIZE_IRQ+CONFIG_STACKSIZE_FIQ, addr_sp); #endif /* leave 3 words for abort-stack */ - addr_sp -= 3; + addr_sp -= 12; /* 8-byte alignment for ABI compliance */ addr_sp &= ~0x07; -- cgit v1.1 From d32a1a4caa2a2ca7c385f4489167e170bf7fb5c1 Mon Sep 17 00:00:00 2001 From: Minkyu Kang Date: Sun, 24 Apr 2011 22:22:34 +0000 Subject: Don't grab memory for LCD if FB address is defined If FB address is defined specific address then don't grab memory for LCD Signed-off-by: Minkyu Kang Cc: Albert Aribaud Cc: Wolfgang Denk Cc: Stefan Roese Cc: Kim Phillips Cc: Andy Fleming Cc: Kumar Gala --- arch/arm/lib/board.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/arm/lib') diff --git a/arch/arm/lib/board.c b/arch/arm/lib/board.c index d5b34ad..1a784a1 100644 --- a/arch/arm/lib/board.c +++ b/arch/arm/lib/board.c @@ -356,9 +356,13 @@ void board_init_f (ulong bootflag) #endif /* CONFIG_VFD */ #ifdef CONFIG_LCD +#ifdef CONFIG_FB_ADDR + gd->fb_base = CONFIG_FB_ADDR; +#else /* reserve memory for LCD display (always full pages) */ addr = lcd_setmem (addr); gd->fb_base = addr; +#endif /* CONFIG_FB_ADDR */ #endif /* CONFIG_LCD */ /* -- cgit v1.1