summary refs log tree commit diff
diff options
context:
space:
mode:
author: Ori Bernstein <ori@eigenstate.org> 2014-09-02 16:21:44 -0400
committer: Ori Bernstein <ori@eigenstate.org> 2014-09-02 16:21:44 -0400
commit: 84d48fceb5f71ed438aec8b7fa0576f0b9c7eb66 (patch)
tree: afe0204d77b797d65db12dcfaa3a65035b013d23
parent: 094074d7ecb008cd3d59dd0d011387299a9fe9c8 (diff)
download: mc-84d48fceb5f71ed438aec8b7fa0576f0b9c7eb66.tar.gz
Switch to much faster register based blitting.
Most values are small. Don't use rep movs. It's fast on many uarches, but has high fixed cost. 5x speedup in intsort. Fuck yeah.
-rw-r--r-- 6/isel.c 44
-rw-r--r-- bench/Makefile 11
2 files changed, 33 insertions, 22 deletions
diff --git a/6/isel.c b/6/isel.c
index 810c8de..c951a3e 100644
--- a/6/isel.c
+++ b/6/isel.c
@@ -374,25 +374,31 @@ static Loc *memloc(Isel *s, Node *e, Mode m)
static void blit(Isel *s, Loc *to, Loc *from, size_t dstoff, size_t srcoff, size_t sz)
{
- Loc *sp, *dp, *len; /* pointers to src, dst */
-
- len = loclit(sz, ModeQ);
- sp = newr(s, from);
- dp = newr(s, to);
-
- /* length to blit */
- g(s, Imov, len, locphysreg(Rrcx), NULL);
- /* source address with offset */
- if (srcoff)
- g(s, Ilea, locmem(srcoff, sp, NULL, ModeQ), locphysreg(Rrsi), NULL);
- else
- g(s, Imov, sp, locphysreg(Rrsi), NULL);
- /* dest address with offset */
- if (dstoff)
- g(s, Ilea, locmem(dstoff, dp, NULL, ModeQ), locphysreg(Rrdi), NULL);
- else
- g(s, Imov, dp, locphysreg(Rrdi), NULL);
- g(s, Irepmovsb, NULL);
+ size_t i;
+ Loc *sp, *dp; /* pointers to src, dst */
+ Loc *tmp, *src, *dst; /* source memory, dst memory */
+
+ sp = inr(s, from);
+ dp = inr(s, to);
+
+ /* Slightly funny loop condition: We might have trailing bytes
+ * that we can't blit word-wise. */
+ tmp = locreg(ModeQ);
+ for (i = 0; i < sz/Ptrsz; i++) {
+ src = locmem(i*Ptrsz + srcoff, sp, NULL, ModeQ);
+ dst = locmem(i*Ptrsz + dstoff, dp, NULL, ModeQ);
+ g(s, Imov, src, tmp, NULL);
+ g(s, Imov, tmp, dst, NULL);
+ }
+ /* now, the trailing bytes */
+ tmp = locreg(ModeB);
+ i *= Ptrsz; /* we counted in Ptrsz chunks; now we need a byte offset */
+ for (; i < sz; i++) {
+ src = locmem(i, sp, NULL, ModeB);
+ dst = locmem(i, dp, NULL, ModeB);
+ g(s, Imov, src, tmp, NULL);
+ g(s, Imov, tmp, dst, NULL);
+ }
}
static int isfunc(Isel *s, Node *n)
diff --git a/bench/Makefile b/bench/Makefile
index 25ce7ad..1f8a3fe 100644
--- a/bench/Makefile
+++ b/bench/Makefile
@@ -6,8 +6,13 @@ BENCHSRC=intsort.myr \
include ../config.mk
include ../mk/c.mk
-bench: runner $(BENCHSRC:.myr=)
+bench: runner cleanbuild
./runner $(BENCHSRC:.myr=)
-$(BENCHSRC:.myr=): $(BENCHSRC)
- ../myrbuild/myrbuild -b $@ $@.myr
+.PHONY: cleanbuild
+cleanbuild:
+ rm -f $(BENCHSRC:.myr=) $(BENCHSRC:.myr=.o) $(BENCHSRC:.myr=.use)
+ @for i in $(BENCHSRC:.myr=); do \
+ ../myrbuild/myrbuild -b $$i $$i.myr; \
+ done
+