ratatoskr-messenger/zwgc/lexer.c

/* This file is part of the Project Athena Zephyr Notification System.
 * It is one of the source files comprising zwgc, the Zephyr WindowGram
 * client.
 *
 *      Created by:     Marc Horowitz <marc@athena.mit.edu>
 *
 *      $Id$
 *
 *      Copyright (c) 1989 by the Massachusetts Institute of Technology.
 *      For copying and distribution information, see the file
 *      "mit-copyright.h".
 */

#include <sysdep.h>

#if (!defined(lint) && !defined(SABER))
static const char rcsid_lexer_c[] = "$Id$";
#endif

#include <zephyr/mit-copyright.h>

/****************************************************************************/
/*                                                                          */
/*               The lexer for the zwgc description language:               */
/*                                                                          */
/****************************************************************************/

#include "new_memory.h"
#include "new_string.h"
#include "int_dictionary.h"
#include "lexer.h"
#include "parser.h"
#include "y.tab.h"

/*
 * yylineno - this holds the current line # we are on.  Updated automatically
 *            by input() and unput(), and reset to 1 by lex_open().
 */

int yylineno;

/*
 * keyword_dict - this dictionary maps keyword names to their token numbers.
 *                It starts out NULL and is filled in from the keywords
 *                table the first time lex_open() is called.
 */

static int_dictionary keyword_dict = NULL;

/****************************************************************************/
/*                                                                          */
/*                               I/O functions:                             */
/*                                                                          */
/****************************************************************************/

/*
 * input_file - this holds the FILE pointer to the file currently being lexed.
 *              It is set by lex_open().
 */

static FILE *input_file;

/*
 * pushback - if not -1, holds a character that was pushed back by unput but
 *            not yet read by input.  Only a single character can be pending
 *            at any time.  lex_open() resets this to -1.
 */

static int pushback = -1;

/*
 * input - return the next character to be lexed.
 *
 *        If a character is pending in pushback it is delivered (and the
 *    pushback slot emptied); otherwise the next byte is read from
 *    input_file.  End of file is reported as 0.  yylineno is incremented
 *    every time a newline is handed out.
 *
 *        The result is an int rather than a char on purpose: getc() yields
 *    the byte as an unsigned char value, and narrowing that to a (possibly
 *    signed) char would turn bytes >= 0x80 into negative numbers.  In
 *    particular byte 0xFF would become -1, which is the "nothing pending"
 *    marker of pushback, so pushing that byte back would silently lose it.
 *    Every caller already stores the result in an int.
 */
static int
input(void)
{
    int c;

    if (pushback != -1) {
        /* Deliver the pending pushed-back character first. */
        c = pushback;
        pushback = -1;
    } else {
        c = getc(input_file);
        if (c == EOF)
            c = 0;              /* the lexer represents <EOF> as 0 */
    }

    if (c == '\n')
        yylineno++;

    return c;
}

/*
 * unput - push character c back so that the next call to input() returns
 *         it again.  Pushing back a newline takes back the line count
 *         increment that input() made when it delivered that newline.
 *
 *         There is room for one pending character only.  When compiled
 *         with DEBUG, pushing back a second character before the first
 *         has been re-read prints a message and exits; without DEBUG the
 *         earlier character is simply overwritten.
 */
static void
unput(int c)
{
#ifdef DEBUG
    if (pushback != -1) {
        printf("Attempt to push back 2 characters at one time!\n");
        exit(1);
    }
#endif

    if (c == '\n') {
        yylineno--;
    }
    pushback = c;
}

/****************************************************************************/
/*                                                                          */
/*                           Initialization routines:                       */
/*                                                                          */
/****************************************************************************/

struct keyword_info {
    string keyword;
    int keyword_number;
};

/*
 * keywords - This table holds a copy of the mapping from keyword name to
 *            token number and is used to initialize keyword_dict:
 */

static struct keyword_info keywords[] =   {
                   { "and", '&' },
		   { "appendport", APPENDPORT },
		   { "buffer", BUFFER },
		   { "break", BREAK },
		   { "closeinput", CLOSEINPUT },
		   { "closeoutput", CLOSEOUTPUT },
		   { "closeport", CLOSEPORT },
		   { "case", CASE },
		   { "clearbuf", CLEARBUF },
		   { "default", DEFAULT },
		   { "do", DO },
		   { "downcase", DOWNCASE },
		   { "else", ELSE },
		   { "elseif", ELSEIF },
		   { "endcase", ENDCASE },
		   { "endif", ENDIF },
		   { "endwhile", ENDWHILE },
		   { "exec", EXEC },
		   { "execport", EXECPORT },
		   { "exit", EXIT },
		   { "fields", FIELDS },
		   { "get", GET },
		   { "getenv", GETENV },
		   { "if", IF },
		   { "inputport", INPUTPORT },
		   { "lany", LANY },
		   { "lbreak", LBREAK },
		   { "lspan", LSPAN },
		   { "match", MATCH },
		   { "noop", NOOP },
		   { "not", '!' },
		   { "or", '|' },
		   { "outputport", OUTPUTPORT },
		   { "print", PRINT },
		   { "protect", PROTECT },
		   { "put", PUT },
		   { "rany", RANY },
		   { "rbreak", RBREAK },
		   { "rspan", RSPAN },
		   { "set", SET },
		   { "show", SHOW },
		   { "stylestrip", STYLESTRIP },
		   { "substitute", SUBSTITUTE },
		   { "then", THEN },
		   { "upcase", UPCASE },
		   { "while", WHILE },
		   { "verbatim", VERBATIM },
		   { "zvar", ZVAR } };

/*
 * lex_open - [re]initialize the lexer so that it reads from file.
 *
 *        The current line # is reset to 1 and any pending pushed-back
 *    character is discarded.  On the first call only, keyword_dict is
 *    created and loaded with every entry of the keywords table; later
 *    calls reuse that dictionary.
 */

void
lex_open(FILE *file)
{
    unsigned int idx;
    unsigned int count;

    /* Reset the I/O state: */
    input_file = file;
    yylineno = 1;
    pushback = -1;

    /* The keyword dictionary only has to be built once: */
    if (keyword_dict) {
        return;
    }

    keyword_dict = int_dictionary_Create(101);

    count = sizeof(keywords) / sizeof(keywords[0]);
    for (idx = 0; idx < count; idx++) {
        int_dictionary_Define(keyword_dict, keywords[idx].keyword,
                              0)->value = keywords[idx].keyword_number;
    }
}

/****************************************************************************/
/*                                                                          */
/*                            lex subroutines:                              */
/*                                                                          */
/****************************************************************************/

/*
 * eat_escape_code - this routine eats an escape code & returns the character
 *                   it codes for, or 0 if it codes for "".
 *                   (an escape code is what follows a '\\' in a quoted
 *                   string)  Current escape codes are:
 *
 *                       "n"          == '\n'
 *                       "t"          == '\t'
 *                       "b"          == '\b'
 *                       "\n"         == "" (i.e., returns 0)
 *                       <EOF>        == "" (the EOF is pushed back)
 *                       [0-7]{1,3}   == the character represented by the code
 *                                       interpreted as an octal number.
 *                       [^ntb0-7\n]  == the same character.  I.e., "*" == '*'
 *
 *                   Note that an octal code whose value is 0 is returned
 *                   as 0 as well, so callers cannot tell it apart from "".
 */

#define  is_octal_digit(c)           (((c)>='0') && ((c)<='7'))

static char
eat_escape_code(void)
{
    int ch = input();
    int value;
    int ndigits;

    if (ch == 0) {              /* i.e., EOF: leave it for the caller */
        unput(ch);
        return ch;
    }
    if (ch == '\n') {
        return 0;
    }
    if (ch == 'n') {
        return '\n';
    }
    if (ch == 't') {
        return '\t';
    }
    if (ch == 'b') {
        return '\b';
    }
    if (!is_octal_digit(ch)) {
        return ch;              /* any other character stands for itself */
    }

    /*
     * Octal code: one digit has been read; accept up to two more, pushing
     * back the first character that is not an octal digit.
     */
    value = ch - '0';
    for (ndigits = 1; ndigits < 3; ndigits++) {
        ch = input();
        if (!is_octal_digit(ch)) {
            unput(ch);
            break;
        }
        value = value * 8 + ch - '0';
    }

    return value;
}

/*
 * eat_string - eat the body of a quoted string, up to and including the
 *              closing '"', translating escape codes introduced by '\\'.
 *
 *        The opening '"' must already have been eaten.  On success the
 *    decoded string is returned; it lives on the heap & must be freed
 *    eventually.  E.g., the input 'hello \n there"' yields the string
 *    "hello <cr> there", where <cr> is a real newline character.
 *
 *        If the <EOF> or an unescaped newline is met before the closing
 *    '"', the offending character is pushed back, a parse error is
 *    reported and 0 is returned.
 *
 *        starting_line is the line # the string started on; it is only
 *    used in the "unterminated string" error message.
 */

static char *
eat_string(int starting_line)
{
    char chunk[500];
    int used = 0;
    int ch;

    for (;;) {
        ch = input();
        if (ch == 0) {          /* <EOF> */
            unput(ch);
            report_parse_error("unterminated string found beginning",
                               starting_line);
            return NULL;
        }

        switch (ch) {
          case '"':
            /* End of the string: hand back what has been collected. */
            chunk[used] = 0;
            return string_Copy(chunk);

          case '\n':
            unput(ch);          /* fix line # reference to right line # */
            report_parse_error("carriage return found in string", yylineno);
            return NULL;

          case '\\':
            /* An escape code standing for "" contributes nothing. */
            ch = eat_escape_code();
            if (ch == 0)
                continue;
            break;

          default:
            break;
        }

        chunk[used++] = ch;

        /*
         * When the chunk is nearly full, let a recursive call read the
         * remainder of the string and glue the two pieces together:
         */
        if (used > (int)sizeof(chunk) - 20) {
            string tail;
            string whole;

            tail = eat_string(starting_line);
            if (tail == NULL)
                return NULL;

            chunk[used] = 0;
            whole = string_Concat(chunk, tail);
            free(tail);
            return whole;
        }
    }
}

/*
 * eat_show_line - internal routine for eat_til_endshow:
 *
 *        This routine reads in a physical line of text allowing escape
 *    codes via '\\'.  If the line ends with a newline, the newline is eaten
 *    and is part of the result.  A backslash directly followed by a newline
 *    also ends the line, but contributes nothing to the result.  If the
 *    line ends with an EOF, the EOF is not eaten.  The string represented
 *    by what has been eaten is returned.  The returned string is on the
 *    heap & must be freed eventually.
 *
 *        If test_for_endshow is true and the line read in starts off with
 *    "endshow" exactly (i.e., no escape codes) followed by any
 *    non-identifier-char, then instead of doing the above, we just eat the
 *    "endshow" & return 0.
 *
 *        Lines longer than the local buffer are handled by letting a
 *    recursive call read the rest of the line.  That call must only be
 *    made while we are still on the starting line: if the character that
 *    filled the buffer was the line's own newline, the line is already
 *    complete, and recursing would swallow the whole next physical line
 *    without giving the caller a chance to test it for "endshow".
 */

static char *
eat_show_line(int test_for_endshow)
{
    int c;
    int saw_escape_code = 0;
    int starting_line = yylineno;
    const int endshow_len = sizeof("endshow") - 1;
    char buffer[200];      /* This must be large enough to hold "endshow" */
    char *ptr = buffer;

    while (yylineno == starting_line) {
        c = input();
        if (!c) {
            /* <EOF>: leave it for the caller and return what we have. */
            unput(c);
            break;
        }
        if (c == '\\') {
            saw_escape_code = 1;
            c = eat_escape_code();
            if (!c)
                continue;       /* the escape code stood for "" */
        }

        *ptr = c;
        ptr++;

        /*
         * Exactly strlen("endshow") characters read so far: is this the
         * terminator?  Peek at the next character to make sure "endshow"
         * is not merely the prefix of a longer identifier.
         */
        if (test_for_endshow && (ptr == buffer + endshow_len)
            && !strncmp(buffer, "endshow", endshow_len)
            && !saw_escape_code) {
            c = input();
            unput(c);
            if (!is_identifier_char(c))
                return NULL;
        }

        /*
         * Buffer (nearly) full and the line is not finished yet: read the
         * rest of this same line recursively and concatenate.
         */
        if ((ptr > buffer + sizeof(buffer) - 2)
            && (yylineno == starting_line)) {
            string the_line;
            string rest_of_line = eat_show_line(0);

            *ptr = '\0';
            the_line = string_Concat(buffer, rest_of_line);
            free(rest_of_line);
            return the_line;
        }
    }

    *ptr = '\0';
    return string_Copy(buffer);
}

/*
 * eat_til_endshow - collect the text of a show command, line by line, up
 *                   to an "endshow" followed by a non-identifier character
 *                   at the start of a line (leading spaces and tabs do not
 *                   count).
 *
 *        Escape codes introduced by '\\' are replaced by what they stand
 *    for, and the leading spaces and tabs of every physical line are
 *    dropped.  The collected text is returned; it is on the heap & must
 *    be freed eventually.  To embed "endshow" in a message, write it with
 *    an escape code, e.g. endsho\w.
 *
 *        If the <EOF> is reached before the terminator, a parse error
 *    mentioning start_line_no (the line # of the show command) is
 *    reported and 0 is returned.
 */

static char *
eat_til_endshow(int start_line_no)
{
    string collected = string_Copy("");
    string line;
    int ch;

    for (;;) {
        /* Drop the spaces & tabs that start the current line: */
        do {
            ch = input();
        } while (ch == ' ' || ch == '\t');
        unput(ch);

        /* Running into the <EOF> here means the show was never closed: */
        if (ch == 0) {
            report_parse_error("unterminated show beginning", start_line_no);
            free(collected);
            return NULL;
        }

        /*
         * Read the rest of the line (including the <cr> at its end).
         * A null result means the line was the endshow terminator.
         * (Note: \<newline> is considered the end of a line here!)
         */
        line = eat_show_line(1);
        if (line == NULL) {
            return collected;
        }

        collected = string_Concat2(collected, line);
        free(line);
    }
}

/*
 * handle_show - this routine is called after "show"\{nonalpha} is
 *               found to handle up to the endshow.  The token # is
 *               returned.
 */

static int
handle_show(void)
{
    int c;
    int start_line_no = yylineno;

    /*
     * Eat up ' ' and '\t's after show.  If the next character is a newline,
     * eat it.  This is so we don't get an extra newline when we call
     * eat_til_endshow:
     */
    while (c=input(), c==' ' || c=='\t') ;
    if (c!='\n')
      unput(c);

    yylval.text = eat_til_endshow(start_line_no);
    if (yylval.text)
      return(SHOW);
    else
      return(ERROR);
}

/****************************************************************************/
/*                                                                          */
/*                         The main lexer itself:                           */
/*                                                                          */
/****************************************************************************/

/*
 * yylex - performs as per. the yacc manual's requirements
 */

int yylex(void)
{
    register int c, last_char;
    register char *ptr;
    int start_line_no;
    int_dictionary_binding *binding;
    char varname[MAX_IDENTIFIER_LENGTH+1];

    for (;;) {
	switch (c = input()) {

	    /*
	     * Skip whitespace:
	     */
	  case ' ':   case '\t':   case '\n':
	    continue;

	    /*
	     * '#' comments out everything up to the and including
	     * the next <cr>:
	     */
	  case '#':
	    while ( (c=input()) && (c!='\n') ) ;
	    if (!c)
	      unput(c);
	    continue;

	    /*
	     * Handle c-style comments.  Note that "/[^*]" is not the start
	     * of any valid token.
	     */
	  case '/':
	    start_line_no = yylineno;

	    /* verify that next character is a '*': */
	    if ((c=input()) != '*')
	      return(ERROR);

	    /* Scan until "*\/" or <EOF>: */
	    for (last_char=0; ; last_char=c) {
		c = input();
		if (c == '/' && (last_char=='*'))
		  break;
		if (!c) {
		    unput(c);
		    report_parse_error("unterminated c style comment found beginning", start_line_no);
		    return(ERROR);
		}
	    }
	    continue;

	    /*
	     * The following characters lex as themselves:
	     *   '+', '|', '&', '(', ')', '.', ',' and <EOF>:
	     */
	  case   0:   case '+':   case '|':   case '&':   case '(':
	  case ')':   case '.':	  case ',':
	    return(c);

	    /*
	     * Handle "=[^~=]", "=~", and "==":
	     */
	  case '=':
	    switch (c = input()) {
	      case '~':
		return(REGEQ);
	      case '=':
		return(EQ);
	      default:
		unput(c);
		return('=');
	    }

	    /*
	     * Handle "![^~=]", "!~", and "!=":
	     */
	  case '!':
	    switch (c = input()) {
	      case '~':
		return(REGNEQ);
	      case '=':
		return(NEQ);
	      default:
		unput(c);
		return('!');
	    }

	    /*
	     * Handle identifiers and keywords:
	     *
	     * Note that the below set of characters is hard coded from
	     * is_identifier_char from parser.h.
	     */
	  case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
	  case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
	  case 'k':   case 'l':   case 'm':   case 'n':   case 'o':
	  case 'p':   case 'q':   case 'r':   case 's':   case 't':
	  case 'u':   case 'v':   case 'w':   case 'x':   case 'y':
	  case 'z':
	  case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
	  case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
	  case 'K':   case 'L':   case 'M':   case 'N':   case 'O':
	  case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
	  case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
	  case 'Z':
	  case '0':   case '1':   case '2':   case '3':   case '4':
	  case '5':   case '6':   case '7':   case '8':   case '9':
	  case '_':
	    /*
	     * Read in the first MAX_IDENTIFIER_LENGTH characters of the
	     * identifier into varname null terminated.  Eat
	     * the rest of the characters of the identifier:
	     */
	    for (ptr = varname;;) {
		if (ptr<varname+MAX_IDENTIFIER_LENGTH)
		  *(ptr++) = c;
		c = input();
		if (!is_identifier_char(c))
		  break;
	    }
	    unput(c);
	    *ptr = '\0';

	    /*
	     * Look up the identifier in the keyword dictionary.
	     * If its a match, return the keyword's #.  In the case
	     * of show, call handle_show to do more processing.
	     * If not a match, treat as a variable name.
	     */
	    binding = int_dictionary_Lookup(keyword_dict, varname);
	    if (!binding) {
		yylval.text = string_Copy(varname);
		return(VARNAME);
	    }
	    if (binding->value == SHOW)
	      return(handle_show());
	    else
	      return(binding->value);

	    /*
	     * Handle "${identifier}".  Note that $ followed by a
	     * non-identifier character is not the start of any valid token.
	     */
	  case '$':
	    c = input();
	    if (!is_identifier_char(c))
	      return(ERROR);

	    /*
	     * Read in the first MAX_IDENTIFIER_LENGTH characters of the
	     * identifier into varname null terminated.  Eat
	     * the rest of the characters of the identifier:
	     */
	    for (ptr = varname;;) {
		if (ptr<varname+MAX_IDENTIFIER_LENGTH)
		  *(ptr++) = c;
		c = input();
		if (!is_identifier_char(c))
		  break;
	    }
	    unput(c);
	    *ptr = '\0';

	    yylval.text = string_Copy(varname);
	    return(VARREF);

	    /*
	     * Handle constant strings:
	     */
	  case '"':
	    yylval.text = eat_string(yylineno);
	    if (yylval.text)
	      return(STRING);
	    else
	      return(ERROR);

	    /*
	     * All other characters do not start valid tokens:
	     */
	  default:
	    return(ERROR);
	}
    }
}