!-----------------------------------------------------------------------
! Run 32 threads on four cores with DMA on thread 5.
!
! Lines beginning with '!' are comments; every other line is a single
! "NAME value ..." directive.
!-----------------------------------------------------------------------

! Tool-level settings: target model, mode, thread count and test name.
TSOTOOL.PROCESSOR               niagara2.rtl
TSOTOOL.MODE                    GEN
TSOTOOL.N_THREADS               32
TSOTOOL.TEST_NAME               fc4_prop_diag
TSOTOOL.BATCH                   Y

!
! It appears that DMA generating threads are about 18% efficient.
! In other words, if all threads try to execute the same number
! of 'instructions' the non-DMA threads will finish long before
! the DMA threads. The following weighting seems to work for an
! FC1 model with one thread doing DMA.
!
! NOTE(review): the text above says "FC1" while TSOTOOL.TEST_NAME is
! fc4_prop_diag -- confirm the weighting was re-checked for this model.
!
! The list below has one entry per thread (32 entries, matching
! TSOTOOL.N_THREADS). The sixth entry (thread 5, the DMA thread named in
! the header) is 19 instead of 100, in line with the ~18% estimate.
!
! Column guide from the original file (presumably core number followed
! by the thread index within that core -- TODO confirm):
!   0: 0 1 2 3 4 5 6 7   1: 0 1 2 3 4 5 6 7   2: 0 1 2 3 4 5 6 7   3: 0 1 2 3 4 5 6 7
GEN.N_INSTR_PER_THREAD          100, 100, 100, 100, 100, 19, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100
GEN.AVG_LOOP_SIZE               0
GEN.AVG_LOOP_ITER               0
! Seed is commented out, so no fixed seed is set by this file.
! GEN.SEED                      499406026

! Address map: 4 regions of 64K each. REGION_OFFSETS holds four
! comma-separated groups of dash-separated offsets (presumably one group
! per region, in region order -- TODO confirm).
ADMAP.RTL.REGION_PA_SEPARATION  8M
ADMAP.N_REGIONS                 4
ADMAP.REGION_SIZE               64K
ADMAP.REGION_OFFSETS            916-920-984-1012, 180-1416-2228-3228, 316-692-1268-1520-1560-2404, 2108-2132-2156-2304-2412-2544-2556
ADMAP.ATTRIBUTES                CV=1110,CP=1110
ADMAP.NC_MEMMAP                 0xc800002000:8G
! N_ALIASES is 0; the two ALIAS_* values below are presumably unused in
! that case -- TODO confirm. 8388608 = 8 * 1024 * 1024.
ADMAP.N_ALIASES                 0
ADMAP.ALIAS_FREQUENCY           64
ADMAP.ALIAS_OFFSET              8388608

! Placeholder value: no simulator is selected by this file.
RUN.SIMULATOR                   no_simulator_defined

! Percentage knobs (PCT_*).
WT.PCT_FP_INSTR                 10
WT.PCT_LITTLE_ENDIAN            5
WT.PCT_LOADS_NF                 0
WT.PCT_NFS_FAULT                0
WT.PCT_PREFETCH_FAULT           20
WT.PCT_PREFETCH_UNIMP           20
WT.PCT_CBRANCH                  0
WT.PCT_SECONDARY_CTX            0
WT.PCT_NUCLEUS_CTX              0

! Per-operation weights. Entries carry either one number, or two numbers
! followed by cpu= / region= qualifiers. Presumably the form is
!   <default weight> <override weight> <where the override applies>
! -- TODO confirm against the TSOTOOL documentation.
! Two qualifier patterns appear below:
!   "N 0 cpu=5"                      : second number 0, qualified to thread 5
!   "0 N cpu=0-4,6-31 region=..."    : qualified to every thread except 5
! which is consistent with thread 5 being reserved for DMA (see header).
WT.REPLACEMENT                  5   0   cpu=5
WT.INTERRUPT                    0

! Loads.
WT.LD                           10  0   cpu=5
WT.BLD                          0   5   cpu=0-4,6-31 region=0-2
WT.DWLD                         0   1   cpu=0-4,6-31 region=0-2
WT.LDD                          0
WT.QWLD                         0
! AQLD is the only entry restricted to region=1 alone.
WT.AQLD                         0   1   cpu=0-4,6-31 region=1

! Stores.
WT.ST                           5   0   cpu=5
WT.BST                          0   10  cpu=0-4,6-31 region=0-2
WT.BSTC                         0
WT.ST_BINIT                     0
WT.DWST_BINIT                   0
WT.DWST                         0   10  cpu=0-4,6-31 region=0-2
WT.QWST                         0

! Atomics.
WT.SWAP                         0   5   cpu=0-4,6-31 region=0-2
WT.CAS                          0   5   cpu=0-4,6-31 region=0-2
WT.CASX                         0   5   cpu=0-4,6-31 region=0-2

! Flushes, barriers, prefetch and NOP.
WT.ASI_L2_FLUSH                 0
WT.FLUSHI                       0
WT.MEMBAR                       0
WT.PREFETCH                     10  0   cpu=5
WT.NOP                          0
! Advanced generator settings.
ADV.L2_WAYS                     16
ADV.RESULTS_TO_MEM              N
ADV.BST_MEMBARS                 Y
ADV.BLD_MEMBARS                 Y
ADV.FP_FLUSH_MEMBARS            Y
ADV.CAS_IMPLICIT_MEMBARS        Y
ADV.WARMUP_ITERATIONS           0
ADV.TEST_REPETITIONS            1

! PEP based DMA operation weights
! The weights here try to skew for mostly small
! and large DMAs, with the emphasis on large and
! skewed towards writes.
!
! Every macro below is qualified with "cpu=5 region=0-2", i.e. thread 5
! (the DMA thread named in the file header) and regions 0 through 2.
! The first number on each line is 0 and the second is the weight given
! with that qualifier (presumably <default> <override> -- TODO confirm).
! The hex suffix (0x8 .. 0x40) presumably encodes the transfer size;
! units are not stated here -- TODO confirm. The meaning of the ALM_
! prefix is likewise not evident from this file.

! DMA reads.
WT.MACRO.ALM_DMA0_RD            0   10  cpu=5 region=0-2
WT.MACRO.ALM_DMA1_RD            0   10  cpu=5 region=0-2
WT.MACRO.ALM_DMA2_RD            0   10  cpu=5 region=0-2
WT.MACRO.ALM_DMA3_RD            0   10  cpu=5 region=0-2
WT.MACRO.DMA0_RD_0x8            0   15  cpu=5 region=0-2
WT.MACRO.DMA0_RD_0xC            0   4   cpu=5 region=0-2
WT.MACRO.DMA0_RD_0x10           0   4   cpu=5 region=0-2
WT.MACRO.DMA0_RD_0x14           0   4   cpu=5 region=0-2
WT.MACRO.DMA0_RD_0x20           0   4   cpu=5 region=0-2
WT.MACRO.DMA0_RD_0x30           0   4   cpu=5 region=0-2
WT.MACRO.DMA0_RD_0x3C           0   30  cpu=5 region=0-2
WT.MACRO.DMA0_RD_0x40           0   10  cpu=5 region=0-2
WT.MACRO.DMA1_RD_0x40           0   10  cpu=5 region=0-2
WT.MACRO.DMA2_RD_0x40           0   10  cpu=5 region=0-2
WT.MACRO.DMA3_RD_0x40           0   10  cpu=5 region=0-2

! DMA writes. No write weight is lower than its read counterpart above
! (the ALM_* entries are equal at 10; the sized entries are all higher),
! which is the "skewed towards writes" part of the note above.
WT.MACRO.ALM_DMA0_WR            0   10  cpu=5 region=0-2
WT.MACRO.ALM_DMA1_WR            0   10  cpu=5 region=0-2
WT.MACRO.ALM_DMA2_WR            0   10  cpu=5 region=0-2
WT.MACRO.ALM_DMA3_WR            0   10  cpu=5 region=0-2
WT.MACRO.DMA0_WR_0x8            0   30  cpu=5 region=0-2
WT.MACRO.DMA0_WR_0xC            0   10  cpu=5 region=0-2
WT.MACRO.DMA0_WR_0x10           0   10  cpu=5 region=0-2
WT.MACRO.DMA0_WR_0x14           0   10  cpu=5 region=0-2
WT.MACRO.DMA0_WR_0x20           0   10  cpu=5 region=0-2
WT.MACRO.DMA0_WR_0x30           0   10  cpu=5 region=0-2
WT.MACRO.DMA0_WR_0x3C           0   80  cpu=5 region=0-2
WT.MACRO.DMA0_WR_0x40           0   25  cpu=5 region=0-2
WT.MACRO.DMA1_WR_0x40           0   25  cpu=5 region=0-2
WT.MACRO.DMA2_WR_0x40           0   25  cpu=5 region=0-2
WT.MACRO.DMA3_WR_0x40           0   25  cpu=5 region=0-2

! The only *_INT macro in this file (presumably an interrupt-generating
! DMA operation -- TODO confirm).
WT.MACRO.ALM_DMA0_INT           0   40  cpu=5 region=0-2