/*
 * Bit-banged pulse output for AVR using cycle-counted inline assembly.
 *
 * A "zero" is a short high pulse and a "one" is a long high pulse on the
 * selected port pin.  The trailing nops of each pulse have been reduced to
 * allow for the additional clock ticks of loop overhead that follow a pulse
 * inside write_byte().
 *
 * NOTE(review): the original author was not sure the timing is right for all
 * four possible sequences of consecutive bits (presumably 0-0, 0-1, 1-0 and
 * 1-1), since the branch/jump overhead differs per path -- verify against
 * the target device's timing requirements.
 *
 * port must be an I/O register reachable by sbi/cbi, and bit a constant bit
 * number; both are passed with the "I" (constant) constraint.
 */

/* Instruction sequence for a "zero" pulse: set pin, 1 nop, clear pin, 1 nop.
 * Expects the named asm operands [io] (I/O address) and [pin] (bit number). */
#define WRITE_ZERO_PULSE_ASM \
    "sbi %[io], %[pin]" "\n\t" \
    "nop"               "\n\t" \
    "cbi %[io], %[pin]" "\n\t" \
    "nop"               "\n\t"

/* Instruction sequence for a "one" pulse: set pin, 5 nops, clear pin.
 * Expects the named asm operands [io] (I/O address) and [pin] (bit number). */
#define WRITE_ONE_PULSE_ASM \
    "sbi %[io], %[pin]" "\n\t" \
    "nop"               "\n\t" \
    "nop"               "\n\t" \
    "nop"               "\n\t" \
    "nop"               "\n\t" \
    "nop"               "\n\t" \
    "cbi %[io], %[pin]" "\n\t"

/* Emit a single "zero" pulse on bit `bit` of I/O register `port`. */
#define write_zero(port, bit) \
    __asm__ __volatile__ ( \
        WRITE_ZERO_PULSE_ASM \
        : /* no outputs */ \
        : [io]  "I" (_SFR_IO_ADDR(port)), \
          [pin] "I" (bit) \
    )

/* Emit a single "one" pulse on bit `bit` of I/O register `port`. */
#define write_one(port, bit) \
    __asm__ __volatile__ ( \
        WRITE_ONE_PULSE_ASM \
        : /* no outputs */ \
        : [io]  "I" (_SFR_IO_ADDR(port)), \
          [pin] "I" (bit) \
    )

/*
 * Send the eight bits of `byte`, most significant bit first, as zero/one
 * pulses on bit `bit` of I/O register `port`.
 *
 * `byte` is evaluated once and copied into a local, so the caller's value is
 * not modified by the shifting.  The pulse sequences are spliced in as
 * strings because a complete asm statement (write_zero()/write_one()) cannot
 * be nested inside another asm template.
 *
 * Use as a statement: write_byte(PORTx, n, value);
 */
#define write_byte(port, bit, byte) \
    do { \
        unsigned char write_byte_bits_ = 8;      /* bits left to send */ \
        unsigned char write_byte_data_ = (byte); /* working copy, shifted out */ \
        __asm__ __volatile__ ( \
            /* shift left; the old MSB lands in the carry flag (1 cycle) */ \
            "L_%=: " "lsl %[data]"      "\n\t" \
            /* carry set -> send a one (2 cycles if taken, 1 if not) */ \
            "brcs I_%="                 "\n\t" \
            /* carry clear -> send a zero */ \
            WRITE_ZERO_PULSE_ASM \
            /* skip the one pulse, go to loop cleanup (2 cycles) */ \
            "rjmp J_%="                 "\n\t" \
            "I_%=: " WRITE_ONE_PULSE_ASM \
            /* decrement the bit counter (1 cycle) */ \
            "J_%=: " "dec %[bits]"      "\n\t" \
            /* dec already sets the zero flag; cpi is kept so the per-bit */ \
            /* cycle count stays as originally tuned (1 cycle) */ \
            "cpi %[bits], 0"            "\n\t" \
            /* more bits to send -> next iteration (2 cycles if taken) */ \
            "brne L_%="                 "\n\t" \
            : [bits] "+d" (write_byte_bits_), /* "d": cpi needs r16..r31 */ \
              [data] "+r" (write_byte_data_) \
            : [io]  "I" (_SFR_IO_ADDR(port)), \
              [pin] "I" (bit) \
            : "cc" /* lsl, dec and cpi modify the status flags */ \
        ); \
    } while (0)