Discussion:
[PATCH] Add zero-overhead looping for xtensa backend
Felix Yang
2014-01-08 16:27:14 UTC
Permalink
Hi Sterling,

This patch implements zero-overhead looping for xtensa backend using
hw-doloop facility.
If OK for trunk, please apply it for me. Thanks.


Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog (revision 206431)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,18 @@
+2014-01-08 Felix Yang <***@gmail.com>
+
+ * config/xtensa/xtensa.c (xtensa_reorg): New.
+ (xtensa_reorg_loops): New.
+ (xtensa_can_use_doloop_p): New.
+ (xtensa_invalid_within_doloop): New.
+ (hwloop_optimize): New.
+ (hwloop_fail): New.
+ (hwloop_pattern_reg): New.
+ (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+ (xtensa_doloop_hooks): Define.
+ * config/xtensa/xtensa.md (doloop_end): New.
+ (zero_cost_loop_start): Rewritten.
+ (zero_cost_loop_end): Rewritten.
+
2014-01-08 Marek Polacek <***@redhat.com>

PR middle-end/59669
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md (revision 206431)
+++ gcc/config/xtensa/xtensa.md (working copy)
@@ -35,6 +35,8 @@
(UNSPEC_TLS_CALL 9)
(UNSPEC_TP 10)
(UNSPEC_MEMW 11)
+ (UNSPEC_LSETUP_START 12)
+ (UNSPEC_LSETUP_END 13)

(UNSPECV_SET_FP 1)
(UNSPECV_ENTRY 2)
@@ -1289,6 +1291,8 @@
(set_attr "length" "3")])


+;; Hardware loop support.
+
;; Define the loop insns used by bct optimization to represent the
;; start and end of a zero-overhead loop (in loop.c). This start
;; template generates the loop insn; the end template doesn't generate
@@ -1296,34 +1300,58 @@

(define_insn "zero_cost_loop_start"
[(set (pc)
- (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
- (const_int 0))
- (label_ref (match_operand 1 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (match_dup 0) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
""
- "loopnez\t%0, %l1"
+ "loop\t%0, %l1_LEND"
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "3")])

(define_insn "zero_cost_loop_end"
[(set (pc)
- (if_then_else (ne (reg:SI 19) (const_int 0))
- (label_ref (match_operand 0 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (reg:SI 19) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
""
{
- xtensa_emit_loop_end (insn, operands);
- return "";
+ xtensa_emit_loop_end (insn, operands);
+ return "";
}
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "0")])

+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+ [(parallel [(set (pc) (if_then_else
+ (ne (match_operand:SI 0 "" "")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_dup 0)
+ (plus:SI (match_dup 0)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+ ""
+{
+ /* The loop optimizer doesn't check the predicates... */
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+})
+

;; Setting a register from a comparison.

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c (revision 206431)
+++ gcc/config/xtensa/xtensa.c (working copy)
@@ -1,6 +1,7 @@
/* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
Copyright (C) 2001-2014 Free Software Foundation, Inc.
Contributed by Bob Wilson (***@tensilica.com) at Tensilica.
+ Zero-overhead looping support by Felix Yang (***@gmail.com).

This file is part of GCC.

@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3. If not see
#include "gimple.h"
#include "gimplify.h"
#include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"

-
/* Enumeration for all of the relational tests, so that we can build
arrays indexed by the test type, and not worry about the order
of EQ, NE, etc. */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,

static bool constantpool_address_p (const_rtx addr);
static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+ unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);

static bool xtensa_member_type_forces_blk (const_tree,
enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p

+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
struct gcc_target targetm = TARGET_INITIALIZER;


@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
}
}

- output_asm_insn ("# loop end for %0", operands);
+ output_asm_insn ("%1_LEND:", operands);
}


@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
return !xtensa_tls_referenced_p (x);
}

+/* Implement TARGET_CAN_USE_DOLOOP_P. */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+ unsigned int level, bool entered_at_top)
+{
+ /* Considering limitations in the hardware, only use doloop for
innermost loops
+ which must be entered from the top. */
+ if (level != 1 || !entered_at_top)
+ return false;
+
+ return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+ Otherwise return why doloop cannot be applied. */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ return NULL;
+}
+
+/* Optimize LOOP. */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+ int i;
+ edge entry_edge;
+ basic_block entry_bb;
+ rtx insn, seq, iter_reg, entry_after;
+
+ if (loop->depth > 1)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+ return false;
+ }
+
+ if (!loop->incoming_dest)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has more than one entry\n",
loop->loop_no);
+ return false;
+ }
+
+ if (loop->incoming_dest != loop->head)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not entered from head\n",
loop->loop_no);
+ return false;
+ }
+
+ if (loop->has_call || loop->has_asm)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+ return false;
+ }
+
+ /* Scan all the blocks to make sure they don't use iter_reg. */
+ if (loop->iter_reg_used || loop->iter_reg_used_outside)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+ return false;
+ }
+
+ /* Check if start_label appears before doloop_end. */
+ insn = loop->start_label;
+ while (insn && insn != loop->loop_end)
+ insn = NEXT_INSN (insn);
+
+ if (!insn)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+ loop->loop_no);
+ return false;
+ }
+
+ /* Get the loop iteration register. */
+ iter_reg = loop->iter_reg;
+
+ gcc_assert (REG_P (iter_reg));
+
+ entry_edge = NULL;
+
+ FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+ if (entry_edge->flags & EDGE_FALLTHRU)
+ break;
+
+ if (entry_edge == NULL)
+ return false;
+
+ /* Place the zero_cost_loop_start instruction before the loop. */
+ entry_bb = entry_edge->src;
+
+ start_sequence ();
+
+ insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+ loop->start_label,
+ loop->iter_reg));
+
+ seq = get_insns ();
+
+ if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+ {
+ basic_block new_bb;
+ edge e;
+ edge_iterator ei;
+
+ emit_insn_before (seq, BB_HEAD (loop->head));
+ seq = emit_label_before (gen_label_rtx (), seq);
+
+ new_bb = create_basic_block (seq, insn, entry_bb);
+ FOR_EACH_EDGE (e, ei, loop->incoming)
+ {
+ if (!(e->flags & EDGE_FALLTHRU))
+ redirect_edge_and_branch_force (e, new_bb);
+ else
+ redirect_edge_succ (e, new_bb);
+ }
+ make_edge (new_bb, loop->head, 0);
+ }
+ else
+ {
+ entry_after = BB_END (entry_bb);
+ while (DEBUG_INSN_P (entry_after)
+ || (NOTE_P (entry_after)
+ && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+ entry_after = PREV_INSN (entry_after);
+ emit_insn_after (seq, entry_after);
+ }
+
+ end_sequence ();
+
+ return true;
+}
+
+/* A callback for the hw-doloop pass. Called when a loop we have discovered
+ turns out not to be optimizable; we have to split the loop_end pattern into
+ a subtract and a test. */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+ rtx test, insn = loop->loop_end;
+
+ emit_insn_before (gen_addsi3 (loop->iter_reg,
+ loop->iter_reg,
+ constm1_rtx),
+ loop->loop_end);
+
+ test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+ insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+ loop->iter_reg, const0_rtx,
+ loop->start_label),
+ loop->loop_end);
+
+ JUMP_LABEL (insn) = loop->start_label;
+ LABEL_NUSES (loop->start_label)++;
+ delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass. This function examines INSN; if
+ it is a doloop_end pattern we recognize, return the reg rtx for the
+ loop counter. Otherwise, return NULL_RTX. */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+ rtx reg;
+
+ if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+ return NULL_RTX;
+
+ reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+ if (!REG_P (reg))
+ return NULL_RTX;
+ return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+ hwloop_pattern_reg,
+ hwloop_optimize,
+ hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+ and tries to rewrite the RTL of these loops so that proper Xtensa
+ hardware loops are generated. */
+
+static void
+xtensa_reorg_loops (void)
+{
+ reorg_loops (true, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass. */
+
+static void
+xtensa_reorg (void)
+{
+ /* We are freeing block_for_insn in the toplev to keep compatibility
+ with old MDEP_REORGS that are not CFG based. Recompute it now. */
+ compute_bb_for_insn ();
+
+ df_analyze ();
+
+ /* Doloop optimization. */
+ xtensa_reorg_loops ();
+}
+
#include "gt-xtensa.h"

Cheers,
Felix
Sterling Augustine
2014-01-08 16:49:46 UTC
Permalink
Post by Felix Yang
Hi Sterling,
This patch implements zero-overhead looping for xtensa backend using
hw-doloop facility.
If OK for trunk, please apply it for me. Thanks.
Hi Felix,

I last worked on zero-overhead loops for Xtensa in the gcc 4.3
timeframe, but when I did, I ran into several problems related to
later optimizations rearranging the code which I didn't have time to
address.

I'm sure much of that experience is completely stale now, but I would
appreciate a detail of the testing you have done with this patch (in
particular, a description of the different xtensa configurations you
tested it against, especially the ones with and without loop
instructions) before I approve it. Please be sure the assembler can
relax the loops it generates as well. I don't see any particular
problem, but there are many, many gotchas when dealing with xtensa
loop instructions.

It also appears that Tensilica has stopped posting test results for
Xtensa, which makes it difficult to evaluate the quality of this
patch.

Thanks,

Sterling
Felix Yang
2014-01-09 15:08:36 UTC
Permalink
Hi Sterling,

Attached please find version 2 of the patch.

I applied this updated patch (with small adaptations) to gcc-4.8.2
and carried out some tests.
I can execute the testcases in a simulator, which support
zero-overhead looping instructions.

First of all, I can successfully build libgcc, libstdc++ and
newlibc for xtensa with this patch.
The newly built xtensa gcc also passed testsuite which comes with newlibc.
I also tested the cases under gcc/testsuite/gcc.c-torture/execute/
directory. There are about 800+ cases tested.
Test result shows no new failed case with this patch, compared
with the original gcc version.
Is that OK?

I also double checked the loop relaxation issue with binutils-2.24
(the latest version).
The result show that the assember can do loop relaxation when the
loop target is too far ( > 256 Byte).
And this is the reason why I don't check the size of the loop.


Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog (revision 206463)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,18 @@
+2014-01-09 Felix Yang <***@gmail.com>
+
+ * config/xtensa/xtensa.c (xtensa_reorg): New.
+ (xtensa_reorg_loops): New.
+ (xtensa_can_use_doloop_p): New.
+ (xtensa_invalid_within_doloop): New.
+ (hwloop_optimize): New.
+ (hwloop_fail): New.
+ (hwloop_pattern_reg): New.
+ (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+ (xtensa_doloop_hooks): Define.
+ * config/xtensa/xtensa.md (doloop_end): New.
+ (zero_cost_loop_start): Rewritten.
+ (zero_cost_loop_end): Rewritten.
+
2014-01-09 Richard Biener <***@suse.de>

PR tree-optimization/59715
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md (revision 206463)
+++ gcc/config/xtensa/xtensa.md (working copy)
@@ -1,6 +1,7 @@
;; GCC machine description for Tensilica's Xtensa architecture.
;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
;; Contributed by Bob Wilson (***@tensilica.com) at Tensilica.
+;; Zero-overhead looping support by Felix Yang (***@gmail.com).

;; This file is part of GCC.

@@ -35,6 +36,8 @@
(UNSPEC_TLS_CALL 9)
(UNSPEC_TP 10)
(UNSPEC_MEMW 11)
+ (UNSPEC_LSETUP_START 12)
+ (UNSPEC_LSETUP_END 13)

(UNSPECV_SET_FP 1)
(UNSPECV_ENTRY 2)
@@ -1289,41 +1292,67 @@
(set_attr "length" "3")])


+;; Hardware loop support.
+
;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c). This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop. This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.

(define_insn "zero_cost_loop_start"
[(set (pc)
- (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
- (const_int 0))
- (label_ref (match_operand 1 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (match_dup 0) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 2 "register_operand" "+a0")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
""
- "loopnez\t%0, %l1"
+ "loop\t%0, %l1_LEND"
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "3")])

(define_insn "zero_cost_loop_end"
[(set (pc)
- (if_then_else (ne (reg:SI 19) (const_int 0))
- (label_ref (match_operand 0 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (reg:SI 19) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 2 "register_operand" "+a0")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
""
{
- xtensa_emit_loop_end (insn, operands);
- return "";
+ xtensa_emit_loop_end (insn, operands);
+ return "";
}
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "0")])

+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+ [(parallel [(set (pc) (if_then_else
+ (ne (match_operand:SI 0 "" "")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_dup 0)
+ (plus:SI (match_dup 0)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+ ""
+{
+ /* The loop optimizer doesn't check the predicates... */
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+})
+

;; Setting a register from a comparison.

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c (revision 206463)
+++ gcc/config/xtensa/xtensa.c (working copy)
@@ -1,6 +1,7 @@
/* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
Copyright (C) 2001-2014 Free Software Foundation, Inc.
Contributed by Bob Wilson (***@tensilica.com) at Tensilica.
+ Zero-overhead looping support by Felix Yang (***@gmail.com).

This file is part of GCC.

@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3. If not see
#include "gimple.h"
#include "gimplify.h"
#include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"

-
/* Enumeration for all of the relational tests, so that we can build
arrays indexed by the test type, and not worry about the order
of EQ, NE, etc. */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,

static bool constantpool_address_p (const_rtx addr);
static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+ unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);

static bool xtensa_member_type_forces_blk (const_tree,
enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p

+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
struct gcc_target targetm = TARGET_INITIALIZER;


@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
}
}

- output_asm_insn ("# loop end for %0", operands);
+ output_asm_insn ("%1_LEND:", operands);
}


@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
return !xtensa_tls_referenced_p (x);
}

+/* Implement TARGET_CAN_USE_DOLOOP_P. */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+ unsigned int level, bool entered_at_top)
+{
+ /* Considering limitations in the hardware, only use doloop for
innermost loops
+ which must be entered from the top. */
+ if (level != 1 || !entered_at_top)
+ return false;
+
+ return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+ Otherwise return why doloop cannot be applied. */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ return NULL;
+}
+
+/* Optimize LOOP. */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+ int i;
+ edge entry_edge;
+ basic_block entry_bb;
+ rtx insn, seq, iter_reg, entry_after;
+
+ if (loop->depth > 1)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+ return false;
+ }
+
+ if (!loop->incoming_dest)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has more than one entry\n",
loop->loop_no);
+ return false;
+ }
+
+ if (loop->incoming_dest != loop->head)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not entered from head\n",
loop->loop_no);
+ return false;
+ }
+
+ if (loop->has_call || loop->has_asm)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+ return false;
+ }
+
+ /* Scan all the blocks to make sure they don't use iter_reg. */
+ if (loop->iter_reg_used || loop->iter_reg_used_outside)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+ return false;
+ }
+
+ /* Check if start_label appears before doloop_end. */
+ insn = loop->start_label;
+ while (insn && insn != loop->loop_end)
+ insn = NEXT_INSN (insn);
+
+ if (!insn)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+ loop->loop_no);
+ return false;
+ }
+
+ /* Get the loop iteration register. */
+ iter_reg = loop->iter_reg;
+
+ gcc_assert (REG_P (iter_reg));
+
+ entry_edge = NULL;
+
+ FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+ if (entry_edge->flags & EDGE_FALLTHRU)
+ break;
+
+ if (entry_edge == NULL)
+ return false;
+
+ /* Place the zero_cost_loop_start instruction before the loop. */
+ entry_bb = entry_edge->src;
+
+ start_sequence ();
+
+ insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+ loop->start_label,
+ loop->iter_reg));
+
+ seq = get_insns ();
+
+ if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+ {
+ basic_block new_bb;
+ edge e;
+ edge_iterator ei;
+
+ emit_insn_before (seq, BB_HEAD (loop->head));
+ seq = emit_label_before (gen_label_rtx (), seq);
+
+ new_bb = create_basic_block (seq, insn, entry_bb);
+ FOR_EACH_EDGE (e, ei, loop->incoming)
+ {
+ if (!(e->flags & EDGE_FALLTHRU))
+ redirect_edge_and_branch_force (e, new_bb);
+ else
+ redirect_edge_succ (e, new_bb);
+ }
+ make_edge (new_bb, loop->head, 0);
+ }
+ else
+ {
+ entry_after = BB_END (entry_bb);
+ while (DEBUG_INSN_P (entry_after)
+ || (NOTE_P (entry_after)
+ && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+ entry_after = PREV_INSN (entry_after);
+ emit_insn_after (seq, entry_after);
+ }
+
+ end_sequence ();
+
+ return true;
+}
+
+/* A callback for the hw-doloop pass. Called when a loop we have discovered
+ turns out not to be optimizable; we have to split the loop_end pattern into
+ a subtract and a test. */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+ rtx test, insn = loop->loop_end;
+
+ emit_insn_before (gen_addsi3 (loop->iter_reg,
+ loop->iter_reg,
+ constm1_rtx),
+ loop->loop_end);
+
+ test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+ insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+ loop->iter_reg, const0_rtx,
+ loop->start_label),
+ loop->loop_end);
+
+ JUMP_LABEL (insn) = loop->start_label;
+ LABEL_NUSES (loop->start_label)++;
+ delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass. This function examines INSN; if
+ it is a doloop_end pattern we recognize, return the reg rtx for the
+ loop counter. Otherwise, return NULL_RTX. */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+ rtx reg;
+
+ if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+ return NULL_RTX;
+
+ reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+ if (!REG_P (reg))
+ return NULL_RTX;
+ return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+ hwloop_pattern_reg,
+ hwloop_optimize,
+ hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+ and tries to rewrite the RTL of these loops so that proper Xtensa
+ hardware loops are generated. */
+
+static void
+xtensa_reorg_loops (void)
+{
+ reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass. */
+
+static void
+xtensa_reorg (void)
+{
+ /* We are freeing block_for_insn in the toplev to keep compatibility
+ with old MDEP_REORGS that are not CFG based. Recompute it now. */
+ compute_bb_for_insn ();
+
+ df_analyze ();
+
+ /* Doloop optimization. */
+ xtensa_reorg_loops ();
+}
+
#include "gt-xtensa.h"
Cheers,
Felix


On Thu, Jan 9, 2014 at 12:49 AM, Sterling Augustine
Post by Sterling Augustine
Post by Felix Yang
Hi Sterling,
This patch implements zero-overhead looping for xtensa backend using
hw-doloop facility.
If OK for trunk, please apply it for me. Thanks.
Hi Felix,
I last worked on zero-overhead loops for Xtensa in the gcc 4.3
timeframe, but when I did, I ran into several problems related to
later optimizations rearranging the code which I didn't have time to
address.
I'm sure much of that experience is completely stale now, but I would
appreciate a detail of the testing you have done with this patch (in
particular, a description of the different xtensa configurations you
tested it against, especially the ones with and without loop
instructions) before I approve it. Please be sure the assembler can
relax the loops it generates as well. I don't see any particular
problem, but there are many, many gotchas when dealing with xtensa
loop instructions.
It also appears that Tensilica has stopped posting test results for
Xtensa, which makes it difficult to evaluate the quality of this
patch.
Thanks,
Sterling
Felix Yang
2014-01-09 23:51:40 UTC
Permalink
Hi Sterling,

Please note that version 2 of the patch is for gcc trunk, not for
gcc-4.8 branch.
Since the doloop_end pattern format has changed, this patch need
small adaptation in order for it to work on gcc-4.8.
Although I test it on gcc-4.8, I think the testing result still
holds for trunk.
Cheers,
Felix
Post by Felix Yang
Hi Sterling,
Attached please find version 2 of the patch.
I applied this updated patch (with small adaptations) to gcc-4.8.2
and carried out some tests.
I can execute the testcases in a simulator, which support
zero-overhead looping instructions.
First of all, I can successfully build libgcc, libstdc++ and
newlibc for xtensa with this patch.
The newly built xtensa gcc also passed testsuite which comes with newlibc.
I also tested the cases under gcc/testsuite/gcc.c-torture/execute/
directory. There are about 800+ cases tested.
Test result shows no new failed case with this patch, compared
with the original gcc version.
Is that OK?
I also double checked the loop relaxation issue with binutils-2.24
(the latest version).
The result show that the assember can do loop relaxation when the
loop target is too far ( > 256 Byte).
And this is the reason why I don't check the size of the loop.
Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog (revision 206463)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,18 @@
+
+ * config/xtensa/xtensa.c (xtensa_reorg): New.
+ (xtensa_reorg_loops): New.
+ (xtensa_can_use_doloop_p): New.
+ (xtensa_invalid_within_doloop): New.
+ (hwloop_optimize): New.
+ (hwloop_fail): New.
+ (hwloop_pattern_reg): New.
+ (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+ (xtensa_doloop_hooks): Define.
+ * config/xtensa/xtensa.md (doloop_end): New.
+ (zero_cost_loop_start): Rewritten.
+ (zero_cost_loop_end): Rewritten.
+
PR tree-optimization/59715
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md (revision 206463)
+++ gcc/config/xtensa/xtensa.md (working copy)
@@ -1,6 +1,7 @@
;; GCC machine description for Tensilica's Xtensa architecture.
;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
;; This file is part of GCC.
@@ -35,6 +36,8 @@
(UNSPEC_TLS_CALL 9)
(UNSPEC_TP 10)
(UNSPEC_MEMW 11)
+ (UNSPEC_LSETUP_START 12)
+ (UNSPEC_LSETUP_END 13)
(UNSPECV_SET_FP 1)
(UNSPECV_ENTRY 2)
@@ -1289,41 +1292,67 @@
(set_attr "length" "3")])
+;; Hardware loop support.
+
;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c). This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop. This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.
(define_insn "zero_cost_loop_start"
[(set (pc)
- (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
- (const_int 0))
- (label_ref (match_operand 1 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (match_dup 0) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 2 "register_operand" "+a0")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
""
- "loopnez\t%0, %l1"
+ "loop\t%0, %l1_LEND"
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "3")])
(define_insn "zero_cost_loop_end"
[(set (pc)
- (if_then_else (ne (reg:SI 19) (const_int 0))
- (label_ref (match_operand 0 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (reg:SI 19) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 2 "register_operand" "+a0")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
""
{
- xtensa_emit_loop_end (insn, operands);
- return "";
+ xtensa_emit_loop_end (insn, operands);
+ return "";
}
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "0")])
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+ [(parallel [(set (pc) (if_then_else
+ (ne (match_operand:SI 0 "" "")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_dup 0)
+ (plus:SI (match_dup 0)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+ ""
+{
+ /* The loop optimizer doesn't check the predicates... */
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+})
+
;; Setting a register from a comparison.
Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c (revision 206463)
+++ gcc/config/xtensa/xtensa.c (working copy)
@@ -1,6 +1,7 @@
/* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
Copyright (C) 2001-2014 Free Software Foundation, Inc.
This file is part of GCC.
@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3. If not see
#include "gimple.h"
#include "gimplify.h"
#include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"
-
/* Enumeration for all of the relational tests, so that we can build
arrays indexed by the test type, and not worry about the order
of EQ, NE, etc. */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
static bool constantpool_address_p (const_rtx addr);
static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+ unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);
static bool xtensa_member_type_forces_blk (const_tree,
enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
struct gcc_target targetm = TARGET_INITIALIZER;
@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
}
}
- output_asm_insn ("# loop end for %0", operands);
+ output_asm_insn ("%1_LEND:", operands);
}
@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
return !xtensa_tls_referenced_p (x);
}
+/* Implement TARGET_CAN_USE_DOLOOP_P. */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+ unsigned int level, bool entered_at_top)
+{
+ /* Considering limitations in the hardware, only use doloop for
innermost loops
+ which must be entered from the top. */
+ if (level != 1 || !entered_at_top)
+ return false;
+
+ return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+ Otherwise return why doloop cannot be applied. */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ return NULL;
+}
+
+/* Optimize LOOP. */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+ int i;
+ edge entry_edge;
+ basic_block entry_bb;
+ rtx insn, seq, iter_reg, entry_after;
+
+ if (loop->depth > 1)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+ return false;
+ }
+
+ if (!loop->incoming_dest)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has more than one entry\n",
loop->loop_no);
+ return false;
+ }
+
+ if (loop->incoming_dest != loop->head)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not entered from head\n",
loop->loop_no);
+ return false;
+ }
+
+ if (loop->has_call || loop->has_asm)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+ return false;
+ }
+
+ /* Scan all the blocks to make sure they don't use iter_reg. */
+ if (loop->iter_reg_used || loop->iter_reg_used_outside)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+ return false;
+ }
+
+ /* Check if start_label appears before doloop_end. */
+ insn = loop->start_label;
+ while (insn && insn != loop->loop_end)
+ insn = NEXT_INSN (insn);
+
+ if (!insn)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+ loop->loop_no);
+ return false;
+ }
+
+ /* Get the loop iteration register. */
+ iter_reg = loop->iter_reg;
+
+ gcc_assert (REG_P (iter_reg));
+
+ entry_edge = NULL;
+
+ FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+ if (entry_edge->flags & EDGE_FALLTHRU)
+ break;
+
+ if (entry_edge == NULL)
+ return false;
+
+ /* Place the zero_cost_loop_start instruction before the loop. */
+ entry_bb = entry_edge->src;
+
+ start_sequence ();
+
+ insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+ loop->start_label,
+ loop->iter_reg));
+
+ seq = get_insns ();
+
+ if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+ {
+ basic_block new_bb;
+ edge e;
+ edge_iterator ei;
+
+ emit_insn_before (seq, BB_HEAD (loop->head));
+ seq = emit_label_before (gen_label_rtx (), seq);
+
+ new_bb = create_basic_block (seq, insn, entry_bb);
+ FOR_EACH_EDGE (e, ei, loop->incoming)
+ {
+ if (!(e->flags & EDGE_FALLTHRU))
+ redirect_edge_and_branch_force (e, new_bb);
+ else
+ redirect_edge_succ (e, new_bb);
+ }
+ make_edge (new_bb, loop->head, 0);
+ }
+ else
+ {
+ entry_after = BB_END (entry_bb);
+ while (DEBUG_INSN_P (entry_after)
+ || (NOTE_P (entry_after)
+ && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+ entry_after = PREV_INSN (entry_after);
+ emit_insn_after (seq, entry_after);
+ }
+
+ end_sequence ();
+
+ return true;
+}
+
+/* A callback for the hw-doloop pass. Called when a loop we have discovered
+ turns out not to be optimizable; we have to split the loop_end pattern into
+ a subtract and a test. */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+ rtx test, insn = loop->loop_end;
+
+ emit_insn_before (gen_addsi3 (loop->iter_reg,
+ loop->iter_reg,
+ constm1_rtx),
+ loop->loop_end);
+
+ test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+ insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+ loop->iter_reg, const0_rtx,
+ loop->start_label),
+ loop->loop_end);
+
+ JUMP_LABEL (insn) = loop->start_label;
+ LABEL_NUSES (loop->start_label)++;
+ delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass. This function examines INSN; if
+ it is a doloop_end pattern we recognize, return the reg rtx for the
+ loop counter. Otherwise, return NULL_RTX. */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+ rtx reg;
+
+ if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+ return NULL_RTX;
+
+ reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+ if (!REG_P (reg))
+ return NULL_RTX;
+ return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+ hwloop_pattern_reg,
+ hwloop_optimize,
+ hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+ and tries to rewrite the RTL of these loops so that proper Xtensa
+ hardware loops are generated. */
+
+static void
+xtensa_reorg_loops (void)
+{
+ reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass. */
+
+static void
+xtensa_reorg (void)
+{
+ /* We are freeing block_for_insn in the toplev to keep compatibility
+ with old MDEP_REORGS that are not CFG based. Recompute it now. */
+ compute_bb_for_insn ();
+
+ df_analyze ();
+
+ /* Doloop optimization. */
+ xtensa_reorg_loops ();
+}
+
#include "gt-xtensa.h"
Cheers,
Felix
On Thu, Jan 9, 2014 at 12:49 AM, Sterling Augustine
Post by Sterling Augustine
Post by Felix Yang
Hi Sterling,
This patch implements zero-overhead looping for xtensa backend using
hw-doloop facility.
If OK for trunk, please apply it for me. Thanks.
Hi Felix,
I last worked on zero-overhead loops for Xtensa in the gcc 4.3
timeframe, but when I did, I ran into several problems related to
later optimizations rearranging the code which I didn't have time to
address.
I'm sure much of that experience is completely stale now, but I would
appreciate a detail of the testing you have done with this patch (in
particular, a description of the different xtensa configurations you
tested it against, especially the ones with and without loop
instructions) before I approve it. Please be sure the assembler can
relax the loops it generates as well. I don't see any particular
problem, but there are many, many gotchas when dealing with xtensa
loop instructions.
It also appears that Tensilica has stopped posting test results for
Xtensa, which makes it difficult to evaluate the quality of this
patch.
Thanks,
Sterling
Yangfei (Felix)
2014-01-10 03:48:55 UTC
Permalink
And here is the xtensa configuration tested (include/xtensa-config.h):

#define XCHAL_HAVE_BE 0
#define XCHAL_HAVE_LOOPS 1
Post by Felix Yang
Hi Sterling,
Please note that version 2 of the patch is for gcc trunk, not for
gcc-4.8 branch.
Since the doloop_end pattern format has changed, this patch need small
adaptation in order for it to work on gcc-4.8.
Although I test it on gcc-4.8, I think the testing result still holds for
trunk.
Cheers,
Felix
Post by Felix Yang
Hi Sterling,
Attached please find version 2 of the patch.
I applied this updated patch (with small adaptations) to gcc-4.8.2
and carried out some tests.
I can execute the testcases in a simulator, which support
zero-overhead looping instructions.
First of all, I can successfully build libgcc, libstdc++ and
newlibc for xtensa with this patch.
The newly built xtensa gcc also passed testsuite which comes with
newlibc.
Post by Felix Yang
I also tested the cases under gcc/testsuite/gcc.c-torture/execute/
directory. There are about 800+ cases tested.
Test result shows no new failed case with this patch, compared
with the original gcc version.
Is that OK?
I also double checked the loop relaxation issue with binutils-2.24
(the latest version).
The result show that the assember can do loop relaxation when the
loop target is too far ( > 256 Byte).
And this is the reason why I don't check the size of the loop.
Index: gcc/ChangeLog
================================================================
===
Post by Felix Yang
--- gcc/ChangeLog (revision 206463)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,18 @@
+
+ * config/xtensa/xtensa.c (xtensa_reorg): New.
+ (xtensa_reorg_loops): New.
+ (xtensa_can_use_doloop_p): New.
+ (xtensa_invalid_within_doloop): New.
+ (hwloop_optimize): New.
+ (hwloop_fail): New.
+ (hwloop_pattern_reg): New.
+ (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end
label.
Post by Felix Yang
+ (xtensa_doloop_hooks): Define.
+ * config/xtensa/xtensa.md (doloop_end): New.
+ (zero_cost_loop_start): Rewritten.
+ (zero_cost_loop_end): Rewritten.
+
PR tree-optimization/59715
Index: gcc/config/xtensa/xtensa.md
================================================================
===
Post by Felix Yang
--- gcc/config/xtensa/xtensa.md (revision 206463)
+++ gcc/config/xtensa/xtensa.md (working copy)
@@ -1,6 +1,7 @@
;; GCC machine description for Tensilica's Xtensa architecture.
;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
;; This file is part of GCC.
@@ -35,6 +36,8 @@
(UNSPEC_TLS_CALL 9)
(UNSPEC_TP 10)
(UNSPEC_MEMW 11)
+ (UNSPEC_LSETUP_START 12)
+ (UNSPEC_LSETUP_END 13)
(UNSPECV_SET_FP 1)
(UNSPECV_ENTRY 2)
@@ -1289,41 +1292,67 @@
(set_attr "length" "3")])
+;; Hardware loop support.
+
;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c). This start -;;
template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop. This start template
+generates ;; the loop insn; the end template doesn't generate any
+instructions since ;; loop end is handled in hardware.
(define_insn "zero_cost_loop_start"
[(set (pc)
- (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
- (const_int 0))
- (label_ref (match_operand 1 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (match_dup 0) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 2 "register_operand" "+a0")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
""
- "loopnez\t%0, %l1"
+ "loop\t%0, %l1_LEND"
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "3")])
(define_insn "zero_cost_loop_end"
[(set (pc)
- (if_then_else (ne (reg:SI 19) (const_int 0))
- (label_ref (match_operand 0 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (reg:SI 19) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 2 "register_operand" "+a0")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
""
{
- xtensa_emit_loop_end (insn, operands);
- return "";
+ xtensa_emit_loop_end (insn, operands); return "";
}
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "0")])
+; operand 0 is the loop count pseudo register ; operand 1 is the
+label to jump to at the top of the loop (define_expand "doloop_end"
+ [(parallel [(set (pc) (if_then_else
+ (ne (match_operand:SI 0 "" "")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_dup 0)
+ (plus:SI (match_dup 0)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+ ""
+{
+ /* The loop optimizer doesn't check the predicates... */
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+})
+
;; Setting a register from a comparison.
Index: gcc/config/xtensa/xtensa.c
================================================================
===
Post by Felix Yang
--- gcc/config/xtensa/xtensa.c (revision 206463)
+++ gcc/config/xtensa/xtensa.c (working copy)
@@ -1,6 +1,7 @@
/* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
Copyright (C) 2001-2014 Free Software Foundation, Inc.
This file is part of GCC.
@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3. If not see
#include "gimple.h"
#include "gimplify.h"
#include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"
-
/* Enumeration for all of the relational tests, so that we can build
arrays indexed by the test type, and not worry about the order
of EQ, NE, etc. */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
static bool constantpool_address_p (const_rtx addr);
static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int
iterations_max,
Post by Felix Yang
+ unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);
static bool xtensa_member_type_forces_blk (const_tree,
enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
struct gcc_target targetm = TARGET_INITIALIZER;
@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
}
}
- output_asm_insn ("# loop end for %0", operands);
+ output_asm_insn ("%1_LEND:", operands);
}
@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum
machine_mode mo
Post by Felix Yang
return !xtensa_tls_referenced_p (x);
}
+/* Implement TARGET_CAN_USE_DOLOOP_P. */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+ unsigned int level, bool entered_at_top)
+{
+ /* Considering limitations in the hardware, only use doloop for
innermost loops
+ which must be entered from the top. */
+ if (level != 1 || !entered_at_top)
+ return false;
+
+ return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+ Otherwise return why doloop cannot be applied. */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ return NULL;
+}
+
+/* Optimize LOOP. */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+ int i;
+ edge entry_edge;
+ basic_block entry_bb;
+ rtx insn, seq, iter_reg, entry_after;
+
+ if (loop->depth > 1)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+ return false;
+ }
+
+ if (!loop->incoming_dest)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has more than one entry\n",
loop->loop_no);
+ return false;
+ }
+
+ if (loop->incoming_dest != loop->head)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not entered from head\n",
loop->loop_no);
+ return false;
+ }
+
+ if (loop->has_call || loop->has_asm)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+ return false;
+ }
+
+ /* Scan all the blocks to make sure they don't use iter_reg. */
+ if (loop->iter_reg_used || loop->iter_reg_used_outside)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+ return false;
+ }
+
+ /* Check if start_label appears before doloop_end. */
+ insn = loop->start_label;
+ while (insn && insn != loop->loop_end)
+ insn = NEXT_INSN (insn);
+
+ if (!insn)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+ loop->loop_no);
+ return false;
+ }
+
+ /* Get the loop iteration register. */
+ iter_reg = loop->iter_reg;
+
+ gcc_assert (REG_P (iter_reg));
+
+ entry_edge = NULL;
+
+ FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+ if (entry_edge->flags & EDGE_FALLTHRU)
+ break;
+
+ if (entry_edge == NULL)
+ return false;
+
+ /* Place the zero_cost_loop_start instruction before the loop. */
+ entry_bb = entry_edge->src;
+
+ start_sequence ();
+
+ insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+ loop->start_label,
+ loop->iter_reg));
+
+ seq = get_insns ();
+
+ if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+ {
+ basic_block new_bb;
+ edge e;
+ edge_iterator ei;
+
+ emit_insn_before (seq, BB_HEAD (loop->head));
+ seq = emit_label_before (gen_label_rtx (), seq);
+
+ new_bb = create_basic_block (seq, insn, entry_bb);
+ FOR_EACH_EDGE (e, ei, loop->incoming)
+ {
+ if (!(e->flags & EDGE_FALLTHRU))
+ redirect_edge_and_branch_force (e, new_bb);
+ else
+ redirect_edge_succ (e, new_bb);
+ }
+ make_edge (new_bb, loop->head, 0);
+ }
+ else
+ {
+ entry_after = BB_END (entry_bb);
+ while (DEBUG_INSN_P (entry_after)
+ || (NOTE_P (entry_after)
+ && NOTE_KIND (entry_after) !=
NOTE_INSN_BASIC_BLOCK))
Post by Felix Yang
+ entry_after = PREV_INSN (entry_after);
+ emit_insn_after (seq, entry_after);
+ }
+
+ end_sequence ();
+
+ return true;
+}
+
+/* A callback for the hw-doloop pass. Called when a loop we have
discovered
Post by Felix Yang
+ turns out not to be optimizable; we have to split the loop_end pattern
into
Post by Felix Yang
+ a subtract and a test. */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+ rtx test, insn = loop->loop_end;
+
+ emit_insn_before (gen_addsi3 (loop->iter_reg,
+ loop->iter_reg,
+ constm1_rtx),
+ loop->loop_end);
+
+ test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+ insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+ loop->iter_reg,
const0_rtx,
Post by Felix Yang
+
loop->start_label),
Post by Felix Yang
+ loop->loop_end);
+
+ JUMP_LABEL (insn) = loop->start_label;
+ LABEL_NUSES (loop->start_label)++;
+ delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass. This function examines INSN; if
+ it is a doloop_end pattern we recognize, return the reg rtx for the
+ loop counter. Otherwise, return NULL_RTX. */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+ rtx reg;
+
+ if (!JUMP_P (insn) || recog_memoized (insn) !=
CODE_FOR_zero_cost_loop_end)
Post by Felix Yang
+ return NULL_RTX;
+
+ reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+ if (!REG_P (reg))
+ return NULL_RTX;
+ return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+ hwloop_pattern_reg,
+ hwloop_optimize,
+ hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end
insns
Post by Felix Yang
+ and tries to rewrite the RTL of these loops so that proper Xtensa
+ hardware loops are generated. */
+
+static void
+xtensa_reorg_loops (void)
+{
+ reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass. */
+
+static void
+xtensa_reorg (void)
+{
+ /* We are freeing block_for_insn in the toplev to keep compatibility
+ with old MDEP_REORGS that are not CFG based. Recompute it now.
*/
Post by Felix Yang
+ compute_bb_for_insn ();
+
+ df_analyze ();
+
+ /* Doloop optimization. */
+ xtensa_reorg_loops ();
+}
+
#include "gt-xtensa.h"
Cheers,
Felix
On Thu, Jan 9, 2014 at 12:49 AM, Sterling Augustine
Post by Sterling Augustine
Post by Felix Yang
Hi Sterling,
This patch implements zero-overhead looping for xtensa backend using
hw-doloop facility.
If OK for trunk, please apply it for me. Thanks.
Hi Felix,
I last worked on zero-overhead loops for Xtensa in the gcc 4.3
timeframe, but when I did, I ran into several problems related to
later optimizations rearranging the code which I didn't have time to
address.
I'm sure much of that experience is completely stale now, but I would
appreciate a detail of the testing you have done with this patch (in
particular, a description of the different xtensa configurations you
tested it against, especially the ones with and without loop
instructions) before I approve it. Please be sure the assembler can
relax the loops it generates as well. I don't see any particular
problem, but there are many, many gotchas when dealing with xtensa
loop instructions.
It also appears that Tensilica has stopped posting test results for
Xtensa, which makes it difficult to evaluate the quality of this
patch.
Thanks,
Sterling Augustine
2014-01-13 17:23:55 UTC
Permalink
#define XCHAL_HAVE_BE 0
#define XCHAL_HAVE_LOOPS 1
Hi Felix,

I like this patch, and expect I will approve it. However, I would like
you to do two more things before I do:

1. Ensure it doesn't generate zcl's when:

#define XCHAL_HAVE_LOOPS 0

2. Ensure it doesn't produce loops bodies that contain ret, retw,
ret.n or retw.n as the last instruction. It might be easier to just
disallow them in loop bodies entirely though.

Thanks!
Felix Yang
2014-10-09 10:52:08 UTC
Permalink
Hello Sterling,

My paper work with the FSF has finished and we can now move
forward with this patch :-)
I rebased the patch on the latest trunk. Attached please find
version 3 of the patch.
And the enclosed patch also includes the two points pointed by
you, do you like it?
Make check regression tested with xtensa-elf-gcc built from trunk
with this patch.
OK to apply?

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog (revision 216036)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,19 @@
+2014-10-09 Felix Yang <***@huawei.com>
+
+ * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
+ * config/xtensa/xtensa.c (xtensa_reorg): New.
+ (xtensa_reorg_loops): New.
+ (xtensa_can_use_doloop_p): New.
+ (xtensa_invalid_within_doloop): New.
+ (hwloop_optimize): New.
+ (hwloop_fail): New.
+ (hwloop_pattern_reg): New.
+ (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+ (xtensa_doloop_hooks): Define.
+ * config/xtensa/xtensa.md (doloop_end): New.
+ (zero_cost_loop_start): Rewritten.
+ (zero_cost_loop_end): Rewritten.
+
2014-10-09 Joern Rennecke <***@embecosm.com>

* config/avr/avr.opt (mmcu=): Change to have a string value.
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md (revision 216036)
+++ gcc/config/xtensa/xtensa.md (working copy)
@@ -35,6 +35,8 @@
(UNSPEC_TLS_CALL 9)
(UNSPEC_TP 10)
(UNSPEC_MEMW 11)
+ (UNSPEC_LSETUP_START 12)
+ (UNSPEC_LSETUP_END 13)

(UNSPECV_SET_FP 1)
(UNSPECV_ENTRY 2)
@@ -1289,41 +1291,67 @@
(set_attr "length" "3")])


+;; Zero-overhead looping support.
+
;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c). This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop. This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.

(define_insn "zero_cost_loop_start"
[(set (pc)
- (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
- (const_int 0))
- (label_ref (match_operand 1 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (match_dup 0) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 2 "register_operand" "+a0")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
""
- "loopnez\t%0, %l1"
+ "loop\t%0, %l1_LEND"
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "3")])

(define_insn "zero_cost_loop_end"
[(set (pc)
- (if_then_else (ne (reg:SI 19) (const_int 0))
- (label_ref (match_operand 0 "" ""))
- (pc)))
- (set (reg:SI 19)
- (plus:SI (reg:SI 19) (const_int -1)))]
+ (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_operand:SI 2 "register_operand" "+a0")
+ (plus (match_dup 2)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
""
{
- xtensa_emit_loop_end (insn, operands);
- return "";
+ xtensa_emit_loop_end (insn, operands);
+ return "";
}
[(set_attr "type" "jump")
(set_attr "mode" "none")
(set_attr "length" "0")])

+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+ [(parallel [(set (pc) (if_then_else
+ (ne (match_operand:SI 0 "" "")
+ (const_int 1))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (match_dup 0)
+ (plus:SI (match_dup 0)
+ (const_int -1)))
+ (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+ ""
+{
+ /* The loop optimizer doesn't check the predicates... */
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+})
+

;; Setting a register from a comparison.

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c (revision 216036)
+++ gcc/config/xtensa/xtensa.c (working copy)
@@ -61,6 +61,8 @@ along with GCC; see the file COPYING3. If not see
#include "gimplify.h"
#include "df.h"
#include "builtins.h"
+#include "dumpfile.h"
+#include "hw-doloop.h"


/* Enumeration for all of the relational tests, so that we can build
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,

static bool constantpool_address_p (const_rtx addr);
static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+ unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const rtx_insn *);

static bool xtensa_member_type_forces_blk (const_tree,
enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p

+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
struct gcc_target targetm = TARGET_INITIALIZER;


@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand
}
}

- output_asm_insn ("# loop end for %0", operands);
+ output_asm_insn ("%1_LEND:", operands);
}


@@ -3712,4 +3727,239 @@ xtensa_legitimate_constant_p (enum machine_mode mo
return !xtensa_tls_referenced_p (x);
}

+/* Implement TARGET_CAN_USE_DOLOOP_P. */
+
+static bool
+xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+ unsigned int loop_depth, bool entered_at_top)
+{
+ if (!TARGET_LOOPS)
+ return false;
+
+ /* Considering limitations in the hardware, only use doloop
+ for innermost loops which must be entered from the top. */
+ if (loop_depth > 1 || !entered_at_top)
+ return false;
+
+ return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+ Otherwise return why doloop cannot be applied. */
+
+static const char *
+xtensa_invalid_within_doloop (const rtx_insn *insn)
+{
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
+ return "Return from a call instruction in the loop.";
+
+ return NULL;
+}
+
+/* Optimize LOOP. */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+ int i;
+ edge entry_edge;
+ basic_block entry_bb;
+ rtx iter_reg;
+ rtx_insn *insn, *seq, *entry_after;
+
+ if (loop->depth > 1)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not innermost\n",
+ loop->loop_no);
+ return false;
+ }
+
+ if (!loop->incoming_dest)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has more than one entry\n",
+ loop->loop_no);
+ return false;
+ }
+
+ if (loop->incoming_dest != loop->head)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d is not entered from head\n",
+ loop->loop_no);
+ return false;
+ }
+
+ if (loop->has_call || loop->has_asm)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d has invalid insn\n",
+ loop->loop_no);
+ return false;
+ }
+
+ /* Scan all the blocks to make sure they don't use iter_reg. */
+ if (loop->iter_reg_used || loop->iter_reg_used_outside)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d uses iterator\n",
+ loop->loop_no);
+ return false;
+ }
+
+ /* Check if start_label appears before doloop_end. */
+ insn = loop->start_label;
+ while (insn && insn != loop->loop_end)
+ insn = NEXT_INSN (insn);
+
+ if (!insn)
+ {
+ if (dump_file)
+ fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+ loop->loop_no);
+ return false;
+ }
+
+ /* Get the loop iteration register. */
+ iter_reg = loop->iter_reg;
+
+ gcc_assert (REG_P (iter_reg));
+
+ entry_edge = NULL;
+
+ FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+ if (entry_edge->flags & EDGE_FALLTHRU)
+ break;
+
+ if (entry_edge == NULL)
+ return false;
+
+ /* Place the zero_cost_loop_start instruction before the loop. */
+ entry_bb = entry_edge->src;
+
+ start_sequence ();
+
+ insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+ loop->start_label,
+ loop->iter_reg));
+
+ seq = get_insns ();
+
+ if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+ {
+ basic_block new_bb;
+ edge e;
+ edge_iterator ei;
+
+ emit_insn_before (seq, BB_HEAD (loop->head));
+ seq = emit_label_before (gen_label_rtx (), seq);
+ new_bb = create_basic_block (seq, insn, entry_bb);
+ FOR_EACH_EDGE (e, ei, loop->incoming)
+ {
+ if (!(e->flags & EDGE_FALLTHRU))
+ redirect_edge_and_branch_force (e, new_bb);
+ else
+ redirect_edge_succ (e, new_bb);
+ }
+
+ make_edge (new_bb, loop->head, 0);
+ }
+ else
+ {
+ entry_after = BB_END (entry_bb);
+ while (DEBUG_INSN_P (entry_after)
+ || (NOTE_P (entry_after)
+ && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+ entry_after = PREV_INSN (entry_after);
+
+ emit_insn_after (seq, entry_after);
+ }
+
+ end_sequence ();
+
+ return true;
+}
+
+/* A callback for the hw-doloop pass. Called when a loop we have discovered
+ turns out not to be optimizable; we have to split the loop_end pattern into
+ a subtract and a test. */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+ rtx test;
+ rtx_insn *insn = loop->loop_end;
+
+ emit_insn_before (gen_addsi3 (loop->iter_reg,
+ loop->iter_reg,
+ constm1_rtx),
+ loop->loop_end);
+
+ test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+ insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+ loop->iter_reg, const0_rtx,
+ loop->start_label),
+ loop->loop_end);
+
+ JUMP_LABEL (insn) = loop->start_label;
+ LABEL_NUSES (loop->start_label)++;
+ delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass. This function examines INSN; if
+ it is a doloop_end pattern we recognize, return the reg rtx for the
+ loop counter. Otherwise, return NULL_RTX. */
+
+static rtx
+hwloop_pattern_reg (rtx_insn *insn)
+{
+ rtx reg;
+
+ if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+ return NULL_RTX;
+
+ reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+ if (!REG_P (reg))
+ return NULL_RTX;
+
+ return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+ hwloop_pattern_reg,
+ hwloop_optimize,
+ hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+ and tries to rewrite the RTL of these loops so that proper Xtensa
+ hardware loops are generated. */
+
+static void
+xtensa_reorg_loops (void)
+{
+ reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass. */
+
+static void
+xtensa_reorg (void)
+{
+ /* We are freeing block_for_insn in the toplev to keep compatibility
+ with old MDEP_REORGS that are not CFG based. Recompute it now. */
+ compute_bb_for_insn ();
+
+ df_analyze ();
+
+ /* Doloop optimization. */
+ xtensa_reorg_loops ();
+}
+
#include "gt-xtensa.h"
Index: gcc/config/xtensa/xtensa.h
===================================================================
--- gcc/config/xtensa/xtensa.h (revision 216036)
+++ gcc/config/xtensa/xtensa.h (working copy)
@@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size;
#define TARGET_S32C1I XCHAL_HAVE_S32C1I
#define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
#define TARGET_THREADPTR XCHAL_HAVE_THREADPTR
+#define TARGET_LOOPS XCHAL_HAVE_LOOPS

#define TARGET_DEFAULT \
((XCHAL_HAVE_L32R ? 0 : MASK_CONST16) | \
Cheers,
Felix


On Tue, Jan 14, 2014 at 1:23 AM, Sterling Augustine
Post by Sterling Augustine
#define XCHAL_HAVE_BE 0
#define XCHAL_HAVE_LOOPS 1
Hi Felix,
I like this patch, and expect I will approve it. However, I would like
#define XCHAL_HAVE_LOOPS 0
2. Ensure it doesn't produce loops bodies that contain ret, retw,
ret.n or retw.n as the last instruction. It might be easier to just
disallow them in loop bodies entirely though.
Thanks!
Loading...