Bare-bones line-input/tokenization routines
#1
Bare-bones line-input/tokenization routines
Not strictly IRIX-specific, but I'm developing a simulator for a CPU architecture I'm working on, and I wanted to be able to run the thing with minimal modification or porting effort on pretty much any post-'80s *nix environment, because why not. As such, I needed a basic line-input feature and simple tokenization, but didn't want to go to the trouble of bringing in readline or libedit and having to worry about how readily those were available. And since I went to the trouble of rolling this up, I figure I might as well share it. I think (not being a rules lawyer) this is all pretty ANSI-friendly and strictly libc:

Code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h> /* Is this C99? Might need to #define true/false if so. */

/* Simple libc-only routine to read one line from stdin without using gets().
   Certifiably *not* optimized for performance and (almost) feature-free.

   Characters are accumulated in a single heap buffer that doubles in size
   whenever it fills up, so lines of any length are accepted.

   Input handling:
     - LF ends the line; the line terminator is not stored.
     - CR and NUL bytes are dropped, so Windows-style CR/LF input reads the
       same as *nix LF input. Oldschool Mac CR-only line ends are not
       supported.
     - Backspace (8) and DEL (127) erase the previously stored character;
       they are ignored when the line is already empty.
     - End of input ends the line as well, so a final line without a
       trailing LF is still returned.

   Returns a freshly malloc()ed, NUL-terminated string that the caller must
   free(). Returns NULL if memory runs out, or if end of input is reached
   before any character was stored for this line (i.e. nothing is left to
   read) - callers should stop reading when they see NULL. */
char * getLine(void) {
    enum { INITIAL_CAPACITY = 128, BACKSPACE = 8, DEL = 127 };

    size_t capacity = INITIAL_CAPACITY;
    size_t length = 0;
    char *line;
    int c;

    line = malloc(capacity);
    if (!line) return NULL;

    for (;;) {
        c = getchar();

        if (c == EOF) {
            /* Nothing buffered means the input is exhausted; report that
               instead of spinning on EOF forever. */
            if (length == 0) {
                free(line);
                return NULL;
            }
            break;
        }

        if (c == '\n') break;
        if (c == 0 || c == '\r') continue;

        if (c == BACKSPACE || c == DEL) {
            /* Never step back past the start of the line. */
            if (length > 0) length--;
            continue;
        }

        /* Always keep one spare byte for the terminating NUL. */
        if (length + 1 >= capacity) {
            char *grown;

            if (capacity > (size_t)-1 / 2) {
                /* Doubling would overflow size_t. */
                free(line);
                return NULL;
            }

            /* Use a temporary so the old buffer can still be freed if
               realloc() fails. */
            grown = realloc(line, capacity * 2);
            if (!grown) {
                free(line);
                return NULL;
            }
            line = grown;
            capacity *= 2;
        }

        line[length++] = (char)c;
    }

    line[length] = 0;
    return line;
}

And the tokenization routine:

Code:
/* Split a string at whitespace delimiters - where "whitespace" is any non-
   alphanumeric/punctuation character, i.e. any byte outside the printable
   range 33..126. Scans from the start of the string 'til a non-delimiter is
   found, then continues until a delimiter (or the end of the string) is
   reached.

   str   string to scan; NULL is treated as "no more tokens".
   next  if non-NULL, *next is set to the address of the closing delimiter
         (or of the terminating NUL) whenever a token is returned. It is left
         untouched when NULL is returned. It may alias str's own variable, as
         in  tok = getToken(p, &p);

   Returns a freshly malloc()ed copy of the token, which the caller must
   free(). Returns NULL if no non-delimiter characters are found before the
   end of the string, or if memory for the copy could not be allocated. */
char * getToken(char * str, char ** next) {
    char *start, *end, *token;
    size_t length;
    unsigned char ch;

    if (!str) return NULL;

    /* Skip leading delimiters. */
    start = str;
    for (;;) {
        ch = (unsigned char)*start;
        if (ch == 0) return NULL;
        if (ch > 32 && ch < 127) break;
        start++;
    }

    /* Advance to the first delimiter after the token. */
    end = start;
    for (;;) {
        ch = (unsigned char)*end;
        if (ch <= 32 || ch >= 127) break;
        end++;
    }
    length = (size_t)(end - start);

    token = malloc(length + 1);
    if (!token) return NULL;

    memcpy(token, start, length);
    token[length] = 0;

    if (next) *next = end;
    return token;
}

Plus a simple test routine:

Code:
/* Test driver: reads lines from stdin and prints each token of every line
   on its own output line. Stops, returning 0, as soon as getLine() returns
   NULL. */
int main(void) {
    char *line, *cursor, *tok;

    /* getLine() returns NULL when it cannot produce a line; stop there
       rather than handing a NULL string to getToken(). */
    while ((line = getLine()) != NULL) {
        cursor = line;

        /* getToken() advances cursor past each token it returns and yields
           NULL once the rest of the line holds no more tokens. */
        while ((tok = getToken(cursor, &cursor)) != NULL) {
            puts(tok);
            free(tok);
        }

        free(line);
    }

    return 0;
}

This is all released under the "do whatever" license ;)

Computers: Amiga 1200, DEC VAXStation 4000/60, DEC MicroPDP-11/73
Synthesizers: Roland JX-10/SH-09/MT-32/D-50, Yamaha DX7-II/V50/TX7/TG33/FB-01, Korg MS-20 Mini/ARP Odyssey/DW-8000/X5DR, Ensoniq SQ-80, E-mu Proteus/2, Nord Lead 2, Behringer Model D
(This post was last modified: 10-05-2021, 03:36 PM by commodorejohn.)
commodorejohn
PDP-X

Trade Count: (0)
Posts: 367
Threads: 7
Joined: May 2018
Find Reply
10-05-2021, 03:32 PM


Messages In This Thread
Bare-bones line-input/tokenization routines - by commodorejohn - 10-05-2021, 03:32 PM
RE: Bare-bones line-input/tokenization routines - by Raion - 10-05-2021, 06:44 PM

Forum Jump:


Users browsing this thread: 1 Guest(s)