674 lines
18 KiB
C
674 lines
18 KiB
C
/* This file is part of the Project Athena Zephyr Notification System.
 * It is one of the source files comprising zwgc, the Zephyr WindowGram
 * client.
 *
 * Created by: Marc Horowitz <marc@athena.mit.edu>
 *
 * $Id$
 *
 * Copyright (c) 1989 by the Massachusetts Institute of Technology.
 * For copying and distribution information, see the file
 * "mit-copyright.h".
 */
|
|
|
|
#include <sysdep.h>
|
|
|
|
/* Revision identification string; left out when lint or SABER is defined. */
#if !defined(lint) && !defined(SABER)
static const char rcsid_lexer_c[] = "$Id$";
#endif
|
|
|
|
#include <zephyr/mit-copyright.h>
|
|
|
|
/****************************************************************************/
/*                                                                          */
/*               The lexer for the zwgc description language:               */
/*                                                                          */
/****************************************************************************/
|
|
|
|
#include "new_memory.h"
|
|
#include "new_string.h"
|
|
#include "int_dictionary.h"
|
|
#include "lexer.h"
|
|
#include "parser.h"
|
|
#include "y.tab.h"
|
|
|
|
/*
|
|
* yylineno - this holds the current line # we are on. Updated automatically
|
|
* by input() and unput().
|
|
*/
|
|
|
|
int yylineno;
|
|
|
|
/*
|
|
* keyword_dict - this dictionary maps keyword names to their token numbers.
|
|
*/
|
|
|
|
static int_dictionary keyword_dict = NULL;
|
|
|
|
/****************************************************************************/
/*                                                                          */
/*                              I/O functions:                              */
/*                                                                          */
/****************************************************************************/
|
|
|
|
/*
 * input_file - the stream currently being lexed.
 */
static FILE *input_file;

/*
 * pushback - the single character handed back by unput() that input() has
 *            not yet re-read, or -1 when nothing is pending.
 */
static int pushback = -1;
|
|
|
|
static char
|
|
input(void)
|
|
{
|
|
int c;
|
|
|
|
if (pushback != -1) {
|
|
c = pushback;
|
|
pushback = -1;
|
|
if (c=='\n')
|
|
yylineno++;
|
|
return(c);
|
|
}
|
|
|
|
c = getc(input_file);
|
|
if (c=='\n')
|
|
yylineno++;
|
|
if (c==EOF)
|
|
c = 0;
|
|
|
|
return(c);
|
|
}
|
|
|
|
static void
|
|
unput(int c)
|
|
{
|
|
#ifdef DEBUG
|
|
if (pushback != -1) {
|
|
printf("Attempt to push back 2 characters at one time!\n");
|
|
exit(1);
|
|
}
|
|
#endif
|
|
|
|
pushback = c;
|
|
if (c == '\n')
|
|
yylineno--;
|
|
}
|
|
|
|
/****************************************************************************/
/*                                                                          */
/*                         Initialization routines:                         */
/*                                                                          */
/****************************************************************************/
|
|
|
|
struct keyword_info {
|
|
string keyword;
|
|
int keyword_number;
|
|
};
|
|
|
|
/*
|
|
* keywords - This table holds a copy of the mapping from keyword name to
|
|
* token number and is used to initialize keyword_dict:
|
|
*/
|
|
|
|
static struct keyword_info keywords[] = {
|
|
{ "and", '&' },
|
|
{ "appendport", APPENDPORT },
|
|
{ "buffer", BUFFER },
|
|
{ "break", BREAK },
|
|
{ "closeinput", CLOSEINPUT },
|
|
{ "closeoutput", CLOSEOUTPUT },
|
|
{ "closeport", CLOSEPORT },
|
|
{ "case", CASE },
|
|
{ "clearbuf", CLEARBUF },
|
|
{ "default", DEFAULT },
|
|
{ "do", DO },
|
|
{ "downcase", DOWNCASE },
|
|
{ "else", ELSE },
|
|
{ "elseif", ELSEIF },
|
|
{ "endcase", ENDCASE },
|
|
{ "endif", ENDIF },
|
|
{ "endwhile", ENDWHILE },
|
|
{ "exec", EXEC },
|
|
{ "execport", EXECPORT },
|
|
{ "exit", EXIT },
|
|
{ "fields", FIELDS },
|
|
{ "get", GET },
|
|
{ "getenv", GETENV },
|
|
{ "if", IF },
|
|
{ "inputport", INPUTPORT },
|
|
{ "lany", LANY },
|
|
{ "lbreak", LBREAK },
|
|
{ "lspan", LSPAN },
|
|
{ "match", MATCH },
|
|
{ "noop", NOOP },
|
|
{ "not", '!' },
|
|
{ "or", '|' },
|
|
{ "outputport", OUTPUTPORT },
|
|
{ "print", PRINT },
|
|
{ "protect", PROTECT },
|
|
{ "put", PUT },
|
|
{ "rany", RANY },
|
|
{ "rbreak", RBREAK },
|
|
{ "rspan", RSPAN },
|
|
{ "set", SET },
|
|
{ "show", SHOW },
|
|
{ "stylestrip", STYLESTRIP },
|
|
{ "substitute", SUBSTITUTE },
|
|
{ "then", THEN },
|
|
{ "upcase", UPCASE },
|
|
{ "while", WHILE },
|
|
{ "verbatim", VERBATIM },
|
|
{ "zvar", ZVAR } };
|
|
|
|
/*
|
|
* lex_open - this routine [re]initializes the lexer & prepares it to lex
|
|
* a file. Resets current line # to 1.
|
|
*/
|
|
|
|
void
|
|
lex_open(FILE *file)
|
|
{
|
|
/*
|
|
* Initialize I/O:
|
|
*/
|
|
input_file = file;
|
|
yylineno = 1;
|
|
pushback = -1;
|
|
|
|
/*
|
|
* Initialize keyword_dict from keywords if needed:
|
|
*/
|
|
if (!keyword_dict) {
|
|
unsigned int i;
|
|
|
|
keyword_dict = int_dictionary_Create(101);
|
|
|
|
for (i=0; i<sizeof(keywords)/sizeof(struct keyword_info); i++)
|
|
int_dictionary_Define(keyword_dict, keywords[i].keyword,
|
|
0)->value = keywords[i].keyword_number;
|
|
}
|
|
}
|
|
|
|
/****************************************************************************/
/*                                                                          */
/*                             lex subroutines:                             */
/*                                                                          */
/****************************************************************************/
|
|
|
|
/*
 * eat_escape_code - this routine eats an escape code (what follows a '\\'
 *                   in a quoted string) and returns the character it
 *                   codes for, or 0 if it codes for "".  The escape codes
 *                   are:
 *
 *                       "n"          == '\n'
 *                       "t"          == '\t'
 *                       "b"          == '\b'
 *                       "\n"         == ""  (i.e., returns 0)
 *                       <EOF>        == ""  (the EOF is pushed back)
 *                       [0-7]{1,3}   == the character represented by the
 *                                       code read as an octal number
 *                       [^ntb0-7\n]  == the same character, e.g. "*" == '*'
 *
 *                   An octal code with value 0 also returns 0, so callers
 *                   see it as "".
 */

#define is_octal_digit(c) (((c)>='0') && ((c)<='7'))

static char
eat_escape_code(void)
{
    int digits_read;
    int value;
    int c = input();

    if (c == 0) {               /* i.e., EOF */
        unput(c);
        return 0;
    }
    if (c == '\n')
        return 0;
    if (c == 'n')
        return '\n';
    if (c == 't')
        return '\t';
    if (c == 'b')
        return '\b';
    if (!is_octal_digit(c))
        return c;

    /*
     * Octal code: the first digit is in hand; gather up to two more,
     * pushing back the first character that is not an octal digit.
     */
    value = c - '0';
    for (digits_read = 1; digits_read < 3; digits_read++) {
        c = input();
        if (!is_octal_digit(c)) {
            unput(c);
            break;
        }
        value = value * 8 + c - '0';
    }
    return value;
}
|
|
|
|
/*
|
|
* eat_string - this routine eats characters allowing escape codes via '\\'
|
|
* until a '"' is eaten. If no '"' is seen before a '\n' or
|
|
* the <EOF>, a parse_error is set & 0 is returned. Otherwise,
|
|
* the string represented by what has been eaten is returned.
|
|
* I.e., 'hello \n there"' would cause "hello \n there" to be
|
|
* returned. (thats not a <cr> in the first case, a <cr> in the
|
|
* second) The returned string is on the heap & must be freed
|
|
* eventually. This routine should be passed the line # that the
|
|
* string we are eating started on.
|
|
*/
|
|
|
|
static char *
|
|
eat_string(int starting_line)
|
|
{
|
|
int c;
|
|
char buffer[500];
|
|
char *ptr = buffer;
|
|
|
|
for (;;) {
|
|
/*
|
|
* Get the next input character, handling EOF:
|
|
*/
|
|
c = input();
|
|
if (!c) {
|
|
unput(c);
|
|
report_parse_error("unterminated string found beginning",
|
|
starting_line);
|
|
return(0);
|
|
}
|
|
|
|
/*
|
|
* Deal with special characters ('\\', '"', and '\n'):
|
|
*/
|
|
if (c=='\\') {
|
|
c = eat_escape_code();
|
|
if (!c)
|
|
continue;
|
|
} else if (c == '"') {
|
|
*ptr = 0;
|
|
return(string_Copy(buffer));
|
|
} else if (c == '\n') {
|
|
unput(c); /* fix line # reference to right line # */
|
|
report_parse_error("carriage return found in string", yylineno);
|
|
return(0);
|
|
}
|
|
|
|
/*
|
|
* Add the character c to the current string:
|
|
*/
|
|
*ptr = c;
|
|
ptr++;
|
|
|
|
/*
|
|
* If out of buffer space, do a recursive call then
|
|
* concatanate the result to the string read in so far to get the
|
|
* entire string and return that:
|
|
*/
|
|
if (ptr>buffer+sizeof(buffer)-20) {
|
|
string rest_of_string, result;
|
|
|
|
rest_of_string = eat_string(starting_line);
|
|
if (!rest_of_string)
|
|
return(0);
|
|
|
|
*ptr = 0;
|
|
result = string_Concat(buffer, rest_of_string);
|
|
free(rest_of_string);
|
|
return(result);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* eat_show_line - internal routine for eat_show:
|
|
*
|
|
* This routine reads in a physical line of text allowing escape
|
|
* codes via '\\'. If the line ends with a newline, the newline is eaten.
|
|
* If the line ends with a EOF, the EOF is not eaten. The string
|
|
* represented by what has been eaten is returned. The returned string
|
|
* is on the heap & must be freed eventually. If test_for_endshow is
|
|
* true and the line read in starts off with "endshow" exactly
|
|
* (i.e., no escape codes) followed by any non-identifier-char, then
|
|
* instead of doing the above, we just eat the "endshow" & return 0.
|
|
*/
|
|
|
|
static char *
|
|
eat_show_line(int test_for_endshow)
|
|
{
|
|
int c;
|
|
int saw_escape_code = 0;
|
|
int starting_line = yylineno;
|
|
char buffer[200]; /* This must be large enough to hold "endshow" */
|
|
char *ptr = buffer;
|
|
|
|
while (yylineno == starting_line) {
|
|
c = input();
|
|
if (!c) {
|
|
unput(c);
|
|
*ptr = '\0';
|
|
return(string_Copy(buffer));
|
|
} else if (c == '\\') {
|
|
saw_escape_code = 1;
|
|
c = eat_escape_code();
|
|
if (!c)
|
|
continue;
|
|
}
|
|
|
|
*ptr = c;
|
|
ptr++;
|
|
|
|
if ((ptr==buffer+strlen("endshow")) && test_for_endshow)
|
|
if (!strncmp(buffer, "endshow", strlen("endshow"))
|
|
&& !saw_escape_code) {
|
|
c = input();
|
|
unput(c);
|
|
if (!is_identifier_char(c))
|
|
return(0);
|
|
}
|
|
|
|
if (ptr>buffer+sizeof(buffer)-2) {
|
|
string the_line;
|
|
string rest_of_line = eat_show_line(0);
|
|
|
|
*ptr = '\0';
|
|
the_line = string_Concat(buffer, rest_of_line);
|
|
free(rest_of_line);
|
|
return(the_line);
|
|
}
|
|
}
|
|
|
|
*ptr = '\0';
|
|
return(string_Copy(buffer));
|
|
}
|
|
|
|
/*
|
|
* eat_til_endshow - this routine eats characters allowing escape codes via
|
|
* '\\' up to a endshow\{nonalpha} found at the
|
|
* start of a line not counting leading whitespace.
|
|
* If <EOF> is seen before the terminator, a parse_error
|
|
* is set & 0 returned. Otherwise, the string represented
|
|
* by what has been eaten (escape codes replaced by what
|
|
* they stand for and leading spaces and tabs removed from
|
|
* each physical line) is returned. The returned string
|
|
* is on the heap & must be freed eventually. Note that
|
|
* to embed endshow in a message, endsho\w can be used.
|
|
* This routine should be passed the line # of the show
|
|
* command it is being used to process for use in error
|
|
* messages.
|
|
*/
|
|
|
|
static char *
|
|
eat_til_endshow(int start_line_no)
|
|
{
|
|
register int c;
|
|
string text_so_far = string_Copy("");
|
|
string next_line;
|
|
|
|
for (;;) {
|
|
/*
|
|
* Skip the spaces & tabs at the start of the current line:
|
|
*/
|
|
while ((c=input()), c==' ' || c=='\t') ;
|
|
unput(c);
|
|
|
|
/*
|
|
* Handle unterminated shows:
|
|
*/
|
|
if (!c) {
|
|
report_parse_error("unterminated show beginning", start_line_no);
|
|
free(text_so_far);
|
|
return(0);
|
|
}
|
|
|
|
/*
|
|
* Read in rest of the line (including the <cr> at end), allowing
|
|
* for escape codes and checking for "endshow{nonalpha}" at the
|
|
* start of the line. (Note: \<newline> is considered the
|
|
* end of a line here!)
|
|
*/
|
|
next_line = eat_show_line(1);
|
|
|
|
if (!next_line) /* i.e., is this the endshow line? */
|
|
return(text_so_far);
|
|
|
|
text_so_far = string_Concat2(text_so_far, next_line);
|
|
free(next_line);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* handle_show - this routine is called after "show"\{nonalpha} is
|
|
* found to handle up to the endshow. The token # is
|
|
* returned.
|
|
*/
|
|
|
|
static int
|
|
handle_show(void)
|
|
{
|
|
int c;
|
|
int start_line_no = yylineno;
|
|
|
|
/*
|
|
* Eat up ' ' and '\t's after show. If the next character is a newline,
|
|
* eat it. This is so we don't get an extra newline when we call
|
|
* eat_til_endshow:
|
|
*/
|
|
while (c=input(), c==' ' || c=='\t') ;
|
|
if (c!='\n')
|
|
unput(c);
|
|
|
|
yylval.text = eat_til_endshow(start_line_no);
|
|
if (yylval.text)
|
|
return(SHOW);
|
|
else
|
|
return(ERROR);
|
|
}
|
|
|
|
/****************************************************************************/
/*                                                                          */
/*                          The main lexer itself:                          */
/*                                                                          */
/****************************************************************************/
|
|
|
|
/*
|
|
* yylex - performs as per. the yacc manual's requirements
|
|
*/
|
|
|
|
int yylex(void)
|
|
{
|
|
register int c, last_char;
|
|
register char *ptr;
|
|
int start_line_no;
|
|
int_dictionary_binding *binding;
|
|
char varname[MAX_IDENTIFIER_LENGTH+1];
|
|
|
|
for (;;) {
|
|
switch (c = input()) {
|
|
|
|
/*
|
|
* Skip whitespace:
|
|
*/
|
|
case ' ': case '\t': case '\n':
|
|
continue;
|
|
|
|
/*
|
|
* '#' comments out everything up to the and including
|
|
* the next <cr>:
|
|
*/
|
|
case '#':
|
|
while ( (c=input()) && (c!='\n') ) ;
|
|
if (!c)
|
|
unput(c);
|
|
continue;
|
|
|
|
/*
|
|
* Handle c-style comments. Note that "/[^*]" is not the start
|
|
* of any valid token.
|
|
*/
|
|
case '/':
|
|
start_line_no = yylineno;
|
|
|
|
/* verify that next character is a '*': */
|
|
if ((c=input()) != '*')
|
|
return(ERROR);
|
|
|
|
/* Scan until "*\/" or <EOF>: */
|
|
for (last_char=0; ; last_char=c) {
|
|
c = input();
|
|
if (c == '/' && (last_char=='*'))
|
|
break;
|
|
if (!c) {
|
|
unput(c);
|
|
report_parse_error("unterminated c style comment found beginning", start_line_no);
|
|
return(ERROR);
|
|
}
|
|
}
|
|
continue;
|
|
|
|
/*
|
|
* The following characters lex as themselves:
|
|
* '+', '|', '&', '(', ')', '.', ',' and <EOF>:
|
|
*/
|
|
case 0: case '+': case '|': case '&': case '(':
|
|
case ')': case '.': case ',':
|
|
return(c);
|
|
|
|
/*
|
|
* Handle "=[^~=]", "=~", and "==":
|
|
*/
|
|
case '=':
|
|
switch (c = input()) {
|
|
case '~':
|
|
return(REGEQ);
|
|
case '=':
|
|
return(EQ);
|
|
default:
|
|
unput(c);
|
|
return('=');
|
|
}
|
|
|
|
/*
|
|
* Handle "![^~=]", "!~", and "!=":
|
|
*/
|
|
case '!':
|
|
switch (c = input()) {
|
|
case '~':
|
|
return(REGNEQ);
|
|
case '=':
|
|
return(NEQ);
|
|
default:
|
|
unput(c);
|
|
return('!');
|
|
}
|
|
|
|
/*
|
|
* Handle identifiers and keywords:
|
|
*
|
|
* Note that the below set of characters is hard coded from
|
|
* is_identifier_char from parser.h.
|
|
*/
|
|
case 'a': case 'b': case 'c': case 'd': case 'e':
|
|
case 'f': case 'g': case 'h': case 'i': case 'j':
|
|
case 'k': case 'l': case 'm': case 'n': case 'o':
|
|
case 'p': case 'q': case 'r': case 's': case 't':
|
|
case 'u': case 'v': case 'w': case 'x': case 'y':
|
|
case 'z':
|
|
case 'A': case 'B': case 'C': case 'D': case 'E':
|
|
case 'F': case 'G': case 'H': case 'I': case 'J':
|
|
case 'K': case 'L': case 'M': case 'N': case 'O':
|
|
case 'P': case 'Q': case 'R': case 'S': case 'T':
|
|
case 'U': case 'V': case 'W': case 'X': case 'Y':
|
|
case 'Z':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
case '_':
|
|
/*
|
|
* Read in the first MAX_IDENTIFIER_LENGTH characters of the
|
|
* identifier into varname null terminated. Eat
|
|
* the rest of the characters of the identifier:
|
|
*/
|
|
for (ptr = varname;;) {
|
|
if (ptr<varname+MAX_IDENTIFIER_LENGTH)
|
|
*(ptr++) = c;
|
|
c = input();
|
|
if (!is_identifier_char(c))
|
|
break;
|
|
}
|
|
unput(c);
|
|
*ptr = '\0';
|
|
|
|
/*
|
|
* Look up the identifier in the keyword dictionary.
|
|
* If its a match, return the keyword's #. In the case
|
|
* of show, call handle_show to do more processing.
|
|
* If not a match, treat as a variable name.
|
|
*/
|
|
binding = int_dictionary_Lookup(keyword_dict, varname);
|
|
if (!binding) {
|
|
yylval.text = string_Copy(varname);
|
|
return(VARNAME);
|
|
}
|
|
if (binding->value == SHOW)
|
|
return(handle_show());
|
|
else
|
|
return(binding->value);
|
|
|
|
/*
|
|
* Handle "${identifier}". Note that $ followed by a
|
|
* non-identifier character is not the start of any valid token.
|
|
*/
|
|
case '$':
|
|
c = input();
|
|
if (!is_identifier_char(c))
|
|
return(ERROR);
|
|
|
|
/*
|
|
* Read in the first MAX_IDENTIFIER_LENGTH characters of the
|
|
* identifier into varname null terminated. Eat
|
|
* the rest of the characters of the identifier:
|
|
*/
|
|
for (ptr = varname;;) {
|
|
if (ptr<varname+MAX_IDENTIFIER_LENGTH)
|
|
*(ptr++) = c;
|
|
c = input();
|
|
if (!is_identifier_char(c))
|
|
break;
|
|
}
|
|
unput(c);
|
|
*ptr = '\0';
|
|
|
|
yylval.text = string_Copy(varname);
|
|
return(VARREF);
|
|
|
|
/*
|
|
* Handle constant strings:
|
|
*/
|
|
case '"':
|
|
yylval.text = eat_string(yylineno);
|
|
if (yylval.text)
|
|
return(STRING);
|
|
else
|
|
return(ERROR);
|
|
|
|
/*
|
|
* All other characters do not start valid tokens:
|
|
*/
|
|
default:
|
|
return(ERROR);
|
|
}
|
|
}
|
|
}
|