IRIX Network Forums
Bare-bones line-input/tokenization routines - Printable Version

+- IRIX Network Forums (//forums.irixnet.org)
+-- Forum: SGI/MIPS (//forums.irixnet.org/forum-3.html)
+--- Forum: Development/Porting (//forums.irixnet.org/forum-9.html)
+--- Thread: Bare-bones line-input/tokenization routines (/thread-3154.html)



Bare-bones line-input/tokenization routines - commodorejohn - 10-05-2021

Not strictly IRIX-specific, but I'm developing a simulator for a CPU architecture I'm working on, and I wanted to be able to run the thing with minimal modification or porting effort on pretty much any post-'80s *nix environment, because why not. As such, I needed a basic line-input feature and simple tokenization, but didn't want to go to the trouble of bringing in readline or libedit and having to worry about how readily those were available. And since I went to the trouble of rolling this up, I figure I might as well share it. I think (not being a rules lawyer) this is all pretty ANSI-friendly and strictly libc:

Code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h> /* C99 header; on a pre-C99 compiler, drop this and #define true/false instead. */

/* Simple libc-only routine to get one line from stdin without using gets().
   Certifiably *not* optimized for performance and 99% feature-free.

   Returns a freshly malloc()ed, NUL-terminated string holding the line with
   its terminator removed; the caller owns it and must free() it. Returns NULL
   if memory runs out, or if getchar() reports EOF (end of input or a read
   error) while the line being built is still empty. A final line that ends
   at EOF without a line feed is still returned.

   Input handling, byte by byte:
     - LF (10) ends the line.
     - CR (13) and NUL (0) are dropped, so Windows CRLF input reads the same
       as *nix LF input. Oldschool Mac CR-only line endings aren't supported.
     - BS (8) and DEL (127) erase the most recently stored character; on an
       empty line they do nothing. */
char * getLine() {
    size_t cap, len;
    char *buf, *grown;
    int c;

    cap = 256;
    len = 0;
    buf = malloc(cap);
    if (!buf) return NULL;

    for (;;) {
        c = getchar();

        if (c == EOF) {
            /* Nothing more will ever arrive. Hand back what we have, or NULL
               if there is nothing, so callers can detect end of input
               instead of spinning forever. */
            if (len == 0) {
                free(buf);
                return NULL;
            }
            break;
        }

        if (c == 10) break;
        if (c == 0 || c == 13) continue;

        if (c == 8 || c == 127) {
            /* Backspace/delete: never step back past the start of the line. */
            if (len > 0) len--;
            continue;
        }

        /* Always keep one byte spare for the terminating NUL. */
        if (len + 1 >= cap) {
            if (cap > (size_t)-1 / 2) {
                /* Doubling would overflow size_t. */
                free(buf);
                return NULL;
            }
            /* Keep the old pointer until realloc succeeds so it can still be
               freed on failure. */
            grown = realloc(buf, cap * 2);
            if (!grown) {
                free(buf);
                return NULL;
            }
            buf = grown;
            cap *= 2;
        }

        buf[len++] = (char)c;
    }

    buf[len] = 0;
    return buf;
}

And the tokenization routine:

Code:
/* Split a string at whitespace delimiters - where "whitespace" is any
   character outside the printable, non-space ASCII range 33..126. Scans from
   the start of the string 'til a non-delimiter is found, then continues
   until a delimiter (or the end of the string) is reached.

   Returns a freshly malloc()ed "token" string containing all non-delimiters
   in between; the caller must free() it. If next is non-NULL, *next is set
   to the address of the closing delimiter, so it can be passed back in as
   str to fetch the following token.

   Returns NULL if str is NULL, if no non-delimiter characters are found
   before the end of the string, or if malloc() fails. *next is left
   untouched whenever NULL is returned. */
char * getToken(char * str, char ** next) {
    char *start, *end, *token;
    size_t len;

    if (!str) return NULL;

    /* Skip leading delimiters, stopping at the terminating NUL. */
    start = str;
    while (start[0] && (start[0] <= 32 || start[0] >= 127)) start++;
    if (start[0] == 0) return NULL;

    /* NUL is itself outside 33..126, so this also stops at end of string. */
    end = start;
    while (end[0] > 32 && end[0] < 127) end++;
    len = (size_t)(end - start);

    token = malloc(len + 1);
    if (!token) return NULL;
    memcpy(token, start, len);
    token[len] = 0;

    if (next) *next = end;
    return token;
}

Plus a simple test routine:

Code:
/* Test driver: reads lines with getLine() until it returns NULL, and prints
   every token of each line on its own output line. */
int main() {
    char *str, *tmp, *tok;

    /* A NULL line means getLine() could not produce one; stop rather than
       hand NULL to getToken(). */
    while ((str = getLine()) != NULL) {
        tmp = str;
        /* getToken() advances tmp past each token it returns. The pointer is
           tested before it is freed, never after. */
        while ((tok = getToken(tmp, &tmp)) != NULL) {
            puts(tok);
            free(tok);
        }
        free(str);
    }

    return 0;
}

This is all released under the "do whatever" license ;)


RE: Bare-bones line-input/tokenization routines - Raion - 10-05-2021

Oh shit, nice stuff to see dude.