0xDE5

Trying to solve and expand 0xde5 exercises in assembly (or C)
git clone git://git.mdnr.space/0xDE5
Log | Files | Refs | README | LICENSE

wcx64e.s (12525B)


      1 #-------------------------------------------------------------------------------
      2 # wcx64: a simplistic wc clone in x64 assembly. Usage:
      3 #
      4 # $ wcx64 file1 /path/file2 file3
      5 #
      6 # When not given any command-line arguments, reads from stdin.
      7 # Always prints all four counters: line, word, byte, integer.
      8 #
      9 # Eli Bendersky (eliben@gmail.com)
     10 # This code is in the public domain
     11 #-------------------------------------------------------------------------------
     12 
     13 #------------- CONSTANTS --------------#
     14 .set READ_SYSCALL, 0
     15 .set WRITE_SYSCALL, 1
     16 .set OPEN_SYSCALL, 2
     17 .set CLOSE_SYSCALL, 3
     18 .set EXIT_SYSCALL, 60
     19 .set STDIN_FD, 0
     20 .set STDOUT_FD, 1
     21 
     22 .set O_RDONLY, 0x0
     23 .set OPEN_NO_MODE, 0x0
     24 .set READBUFLEN, 16384
     25 .set ITOABUFLEN, 12
     26 .set NEWLINE, '\n'
     27 .set CR, '\r'
     28 .set TAB, '\t'
     29 .set SPACE, ' '
     30 .set DIGIT_ZERO, '0'
     31 .set DIGIT_NINE, '9'
     32 
     33 #---------------- DATA ----------------#
     34     .data
     35 
     36 newline_str:
     37     .asciz "\n"
     38 
     39 fourspace_str:
     40     .asciz "    "
     41 
     42 total_str:
     43     .asciz "total"
     44 
     45 windows_style_label:
     46     .asciz " [windows]"
     47 
     48 unix_style_label:
     49     .asciz " [unix]"
     50 
     51 style_flag: # 0: unix, 1: windows
     52     .space 1, 0x0 # default is unix
     53 
     54 integer_flag: # 0: false, 1: true
     55     .space 1, 0x0
     56 
     57 total_integer_counter:
     58     .space 8, 0x0
     59 
     60 file_integer_counter:
     61     .space 8, 0x0
     62 
     63 buf_for_read:
     64     # leave space for terminating 0
     65     .space READBUFLEN + 1, 0x0
     66 
     67     # The itoa buffer here is large enough to hold just 11 digits (plus one
     68     # byte for the terminating null). For the wc counters this is enough
     69     # because it lets us represent 10-digit numbers (up to 10 GB)
     70     # with spaces in between.
     71     # Note: this is an artificial limitation for simplicity in printing out the
     72     # counters; this size can be easily increased.
     73 buf_for_itoa:
     74     .space ITOABUFLEN, 0x0
     75     .set   endbuf_for_itoa, buf_for_itoa + ITOABUFLEN - 1
     76 
     77 #---------------- "MAIN" CODE ----------------#
     78     .globl _start
     79     .text
     80 
     81 _start:
     82     # If there are no argv, go to .L_no_argv for reading from
     83     # stdin.
     84     mov (%rsp), %rbx                # (%rsp) is argc
     85     cmp $1, %rbx
     86     jle .L_no_argv
     87 
     88     xor %r13, %r13
     89     xor %r14, %r14
     90     xor %r15, %r15
     91 
     92     # In a loop, argv[n] for 1 <= n < argc; rbp holds n.
     93     mov $1, %rbp
     94 
     95 .L_argv_loop:
     96     # Throughout the loop, register assignments:
     97     # r12: argv[n]. Also gets into rdi for passing into the open() syscall
     98     # rbp: argv counter n
     99     # rbx: holds argc
    100     # r13, r14, r15: total numbers counted in all files.
    101     mov 8(%rsp, %rbp, 8), %rdi      # argv[n] is in (rsp + 8 + 8*n)
    102     mov %rdi, %r12
    103 
    104     # Call open(argv[n], O_RDONLY).
    105     mov $O_RDONLY, %rsi
    106     mov $OPEN_NO_MODE, %rdx
    107     mov $OPEN_SYSCALL, %rax
    108     syscall
    109 
    110     # Ignore files that can't be opened
    111     cmp  $0, %rax
    112     jl   .L_next_argv
    113     push %rax                       # save fd on the stack
    114 
    115     mov  %rax, %rdi
    116     call count_in_file
    117 
    118     # Add the counters returned from count_in_file to the totals and pass
    119     # them to print_counters.
    120     mov  %rax, %rdi
    121     add  %rax, %r13
    122     mov  %rdx, %rsi
    123     add  %rdx, %r14
    124     mov  %r9, %rdx
    125     add  %r9, %r15
    126     mov file_integer_counter, %r11
    127     add %r11, total_integer_counter
    128     mov  %r12, %rcx
    129     call print_counters
    130     movb $0, style_flag # reset the newline label
    131 
    132     # Call close(argv[n])
    133     pop %rdi                        # restore fd from the stack
    134     mov $CLOSE_SYSCALL, %rax
    135     syscall
    136 
    137 .L_next_argv:
    138     inc %rbp
    139     cmp %rbx, %rbp
    140     jl  .L_argv_loop
    141 
    142     # Done with all argv. Now print out the totals.
    143     mov  %r13, %rdi
    144     mov  %r14, %rsi
    145     mov  %r15, %rdx
    146     mov  total_integer_counter, %r11
    147     lea  total_str, %rcx
    148     call print_counters
    149 
    150     jmp .L_wcx64_exit
    151 
    152 .L_no_argv:
    153     # Read from stdin, which is file descriptor 0.
    154     mov  $STDIN_FD, %rdi
    155     call count_in_file
    156 
    157     # Print the counters without a name string
    158     mov  %rax, %rdi
    159     mov  %rdx, %rsi
    160     mov  %r9, %rdx
    161     mov  $0, %rcx
    162     call print_counters
    163 
    164 .L_wcx64_exit:
    165     # exit(0)
    166     mov $0, %rdi
    167     mov $EXIT_SYSCALL, %rax
    168     syscall
    169     ret
    170 
    171 #---------------- FUNCTIONS ----------------#
    172 
    173 # Function count_in_file
    174 # Counts chars, words and lines for a single file.
    175 #
    176 # Arguments:
    177 # rdi     file descriptor representing an open file.
    178 #
    179 # Returns:
    180 # rax                   line count
    181 # rdx                   word count
    182 # r9                    char count
    183 # file_integer_counter  int count
    184 count_in_file:
    185     # Save callee-saved registers.
    186     push %r12
    187     push %r13
    188     push %r14
    189     push %r15
    190 
    191     # Register usage within the function:
    192     #
    193     # rdi: holds the fd
    194     # r9: char counter
    195     # r15: word counter
    196     # r14: line counter
    197     # r13: address of the read buffer
    198     # rcx: loop index for going over a read buffer
    199     # dl: next byte read from the buffer
    200     # r12: state indicator, with the states defined below.
    201     # the word counter is incremented when we switch from
    202     # IN_WHITESPACE to IN_WORD.
    203     .set IN_WORD, 1
    204     .set IN_WHITESPACE, 2
    205 
    206     # In addition, rsi, rdx, rax are used in the call to read().
    207     # After each call to read(), rax is used for its return value.
    208     xor %r9, %r9
    209     xor %r15, %r15
    210     xor %r14, %r14
    211     movq $0, file_integer_counter
    212     lea buf_for_read, %r13
    213     mov $IN_WHITESPACE, %r12
    214 
    215 .L_read_buf:
    216     # Call read(fd, buf_for_read, READBUFLEN). rdi already contains fd
    217     mov %r13, %rsi
    218     mov $READBUFLEN, %rdx
    219     mov $READ_SYSCALL, %rax
    220     syscall
    221 
    222     # From here on, rax holds the number of bytes actually read from the
    223     # file (the return value of read())
    224     add %rax, %r9                       # Update the char counter
    225 
    226     cmp $0, %rax                        # No bytes read?
    227     je  .L_done_with_file
    228     mov $1, %r14                        # to count visual lines
    229 
    230     xor %rcx, %rcx
    231 .L_next_byte_in_buf:
    232     movb (%r13, %rcx, 1), %dl           # Read the byte
    233 
    234     # See what we've got and jump to the appropriate label.
    235     cmp $NEWLINE, %dl
    236     je  .L_seen_newline
    237     cmp $CR, %dl
    238     je  .L_seen_whitespace_not_newline
    239     cmp $SPACE, %dl
    240     je  .L_seen_whitespace_not_newline
    241     cmp $TAB, %dl
    242     je  .L_seen_whitespace_not_newline
    243     # else, it's not whitespace but a part of a word.
    244 
    245     # If we're in a word already, nothing else to do.
    246     cmp $IN_WORD, %r12
    247     je  .L_done_with_this_byte
    248     # else, transition from IN_WHITESPACE to IN_WORD: increment the word
    249     # counter.
    250     inc %r15
    251     mov $IN_WORD, %r12
    252     # start of the word: if it's a digit, set the digit flag
    253     cmp $DIGIT_ZERO, %dl
    254     jb .L_done_with_this_byte
    255     cmp $DIGIT_NINE, %dl
    256     ja .L_done_with_this_byte
    257     movb $1, integer_flag
    258 
    259     jmp .L_done_with_this_byte
    260 
    261 .L_seen_newline:
    262     # Increment the line counter and fall through.
    263     inc %r14
    264     dec %rcx
    265     cmpb $CR, (%r13, %rcx, 1)
    266     jne .L_not_windows_style_newline
    267     movb $1, style_flag
    268 .L_not_windows_style_newline:
    269     inc %rcx
    270 
    271 .L_seen_whitespace_not_newline:
    272     cmp $IN_WORD, %r12
    273     je  .L_end_current_word
    274     # Otherwise, still in whitespace.
    275     jmp .L_done_with_this_byte
    276 
    277 .L_end_current_word:
    278     # if integer flag is set, increase the integer counter
    279     cmpb $1, integer_flag
    280     jne .L_not_integer
    281     addq $1, file_integer_counter
    282 .L_not_integer:
    283     mov $IN_WHITESPACE, %r12
    284 
    285 .L_done_with_this_byte:
    286     cmp $DIGIT_ZERO, %dl
    287     jae .L_check_upper_limit
    288     movb $0, integer_flag
    289 .L_check_upper_limit:
    290     cmp $DIGIT_NINE, %dl
    291     jbe .L_advance_read_pointer
    292     movb $0, integer_flag
    293 .L_advance_read_pointer:
    294     # Advance read pointer and check if we haven't finished with the read
    295     # buffer yet.
    296     inc %rcx
    297     cmp %rcx, %rax
    298     jg  .L_next_byte_in_buf
    299 
    300     # Done going over this buffer. We need to read another buffer
    301     # if rax == READBUFLEN.
    302     cmp $READBUFLEN, %rax
    303     je  .L_read_buf
    304 
    305 .L_done_with_file:
    306     # Done with this file. The char count is already in r9.
    307     # Put the word and line counts in their return locations.
    308     cmp $NEWLINE, %dl
    309     jne .L_dont_decrease
    310     dec %r14
    311 .L_dont_decrease:
    312     mov %r15, %rdx
    313     mov %r14, %rax
    314 
    315     # Restore callee-saved registers.
    316     pop %r15
    317     pop %r14
    318     pop %r13
    319     pop %r12
    320     ret
    321 
    322 # Function print_cstring
    323 # Print a null-terminated string to stdout.
    324 #
    325 # Arguments:
    326 # rdi     address of string
    327 #
    328 # Returns: void
    329 print_cstring:
    330     # Find the terminating null
    331     mov %rdi, %r10
    332 .L_find_null:
    333     cmpb $0, (%r10)
    334     je   .L_end_find_null
    335     inc  %r10
    336     jmp  .L_find_null
    337 
    338 .L_end_find_null:
    339     # r10 points to the terminating null. so r10-rdi is the length
    340     sub %rdi, %r10
    341     # Now that we have the length, we can call sys_write
    342     # sys_write(unsigned fd, char* buf, size_t count)
    343     mov $WRITE_SYSCALL, %rax
    344     # Populate address of string into rsi first, because the later
    345     # assignment of fd clobbers rdi.
    346     mov %rdi, %rsi
    347     mov $STDOUT_FD, %rdi
    348     mov %r10, %rdx
    349     syscall
    350     ret
    351 
    352 # Function print_counters
    353 # Print three counters with an optional name to stdout.
    354 #
    355 # Arguments:
    356 # rdi, rsi, rdx, r11:   the counters
    357 # rcx:             address of the name C-string. If 0, no name is printed.
    358 #
    359 # Returns: void
    360 print_counters:
    361     push %r14
    362     push %r15
    363     push %r11
    364     push %rdx
    365     push %rsi
    366     push %rdi
    367     # rcx can be clobbered by callees, so save it in %r14.
    368     mov  %rcx, %r14
    369 
    370     # r15 is the counter pointer, running over 0, 1, 2
    371     # counter N is at (rsp + 8 * r15)
    372     xor %r15, %r15
    373 
    374 .L_print_next_counter:
    375     # Fill the itoa buffer with spaces.
    376     lea  buf_for_itoa, %rdi
    377     mov  $SPACE, %rsi
    378     mov  $ITOABUFLEN, %rdx
    379     call memset
    380     # Convert the next counter and then call print_cstring with the
    381     # beginning of the itoa buffer - because we want space-prefixed
    382     # output.
    383     mov  (%rsp, %r15, 8), %rdi
    384     lea  endbuf_for_itoa, %rsi
    385     call itoa
    386     lea  buf_for_itoa, %rdi
    387     call print_cstring
    388     inc %r15
    389     cmp $4, %r15
    390     jl  .L_print_next_counter
    391 
    392     # If name address is not 0, print out the given null-terminated string
    393     # as well.
    394     cmp  $0, %r14
    395     je   .L_print_label
    396     lea  fourspace_str, %rdi
    397     call print_cstring
    398     mov  %r14, %rdi
    399     call print_cstring
    400     cmp $total_str, %r14
    401     je .L_print_counters_done
    402 .L_print_label:
    403     lea unix_style_label, %rdi
    404     cmpb $1, style_flag
    405     jne .L_print_unix_label
    406     lea windows_style_label, %rdi
    407 .L_print_unix_label:
    408     call print_cstring
    409 
    410 .L_print_counters_done:
    411     lea  newline_str, %rdi
    412     call print_cstring
    413     pop  %rdi
    414     pop  %rsi
    415     pop  %rdx
    416     pop  %r11
    417     pop  %r15
    418     pop  %r14
    419     ret
    420 
    421 # Function memset
    422 # Fill memory with some byte
    423 #
    424 # Arguments:
    425 # rdi:    pointer to memory
    426 # rsi:    fill byte (in the low 8 bits)
    427 # rdx:    how many bytes to fill
    428 #
    429 # Returns: void
    430 memset:
    431     xor %r10, %r10
    432 
    433 .L_next_byte:
    434     movb %sil, (%rdi, %r10, 1)          # sil is rsi's low 8 bits
    435     inc  %r10
    436     cmp  %rdx, %r10
    437     jl   .L_next_byte
    438     ret
    439 
    440 # Function itoa
    441 # Convert an integer to a null-terminated string in memory.
    442 # Assumes that there is enough space allocated in the target
    443 # buffer for the representation of the integer. Since the number itself
    444 # is accepted in the register, its value is bounded.
    445 #
    446 # Arguments:
    447 # rdi:    the integer
    448 # rsi:    address of the *last* byte in the target buffer. bytes will be filled
    449 #         starting with this address and proceeding lower until the number
    450 #         runs out.
    451 #
    452 # Returns:
    453 # rax:    address of the first byte in the target string that
    454 #         contains valid information.
    455 itoa:
    456     movb $0, (%rsi)        # Write the terminating null and advance.
    457 
    458     # If the input number is negative, we mark it by placing 1 into r9
    459     # and negate it. In the end we check if r9 is 1 and add a '-' in front.
    460     mov $0, %r9
    461     cmp $0, %rdi
    462     jge .L_input_positive
    463     neg %rdi
    464     mov $1, %r9
    465 
    466 .L_input_positive:
    467 
    468     mov %rdi, %rax          # Place the number into rax for the division.
    469     mov $10, %r8            # The base is in r8
    470 
    471 .L_next_digit:
    472     # Prepare rdx:rax for division by clearing rdx. rax remains from the
    473     # previous div. rax will be rax / 10, rdx will be the next digit to
    474     # write out.
    475     xor %rdx, %rdx
    476     div %r8
    477 
    478     # Write the digit to the buffer, in ascii
    479     dec  %rsi
    480     add  $0x30, %dl
    481     movb %dl, (%rsi)
    482 
    483     cmp $0, %rax            # We're done when the quotient is 0.
    484     jne .L_next_digit
    485 
    486     # If we marked in r9 that the input is negative, it's time to add that
    487     # '-' in front of the output.
    488     cmp  $1, %r9
    489     jne  .L_itoa_done
    490     dec  %rsi
    491     movb $0x2d, (%rsi)
    492 
    493 .L_itoa_done:
    494     mov %rsi, %rax          # rsi points to the first byte now; return it.
    495     ret