674 lines
18 KiB
C

/* This file is part of the Project Athena Zephyr Notification System.
* It is one of the source files comprising zwgc, the Zephyr WindowGram
* client.
*
* Created by: Marc Horowitz <marc@athena.mit.edu>
*
* $Id$
*
* Copyright (c) 1989 by the Massachusetts Institute of Technology.
* For copying and distribution information, see the file
* "mit-copyright.h".
*/
#include <sysdep.h>
#if (!defined(lint) && !defined(SABER))
static const char rcsid_lexer_c[] = "$Id$";
#endif
#include <zephyr/mit-copyright.h>
/****************************************************************************/
/* */
/* The lexer for the zwgc description language: */
/* */
/****************************************************************************/
#include "new_memory.h"
#include "new_string.h"
#include "int_dictionary.h"
#include "lexer.h"
#include "parser.h"
#include "y.tab.h"
/*
 * yylineno - this holds the current line # we are on.  It is kept up to
 *            date automatically: input() increments it each time it
 *            returns a '\n', unput() decrements it each time a '\n' is
 *            pushed back, and lex_open() resets it to 1.
 */
int yylineno;
/*
 * keyword_dict - this dictionary maps keyword names to their token numbers.
 *                It stays NULL until the first call to lex_open(), which
 *                fills it in from the keywords table; yylex() looks
 *                identifiers up in it to tell keywords from variable names.
 */
static int_dictionary keyword_dict = NULL;
/****************************************************************************/
/* */
/* I/O functions: */
/* */
/****************************************************************************/
/*
 * input_file - this holds the FILE pointer to the file currently being lexed.
 *              It is set by lex_open() and read by input().
 */
static FILE *input_file;
/*
 * pushback - if not -1, holds a character that was pushed back by unput but
 *            not yet read by input.  There is room for only one such
 *            character: a second unput() before the next input() overwrites
 *            the first (when DEBUG is defined, unput() prints a message and
 *            exits instead).  Note that 0, the lexer's end-of-file marker,
 *            is a legitimate pushed-back value.
 */
static int pushback = -1;
/*
 * input - return the next character of the file being lexed, or 0 once
 *         the end of the file has been reached.  A character handed to
 *         unput() earlier is returned first, exactly as it was pushed back.
 *         yylineno is incremented whenever the character returned is a
 *         '\n'.
 */
static char
input(void)
{
    int next;

    if (pushback == -1) {
        next = getc(input_file);
        if (next == EOF)
            return(0);          /* end of file lexes as character 0 */
    } else {
        next = pushback;
        pushback = -1;
    }

    if (next == '\n')
        yylineno++;
    return(next);
}
/*
 * unput - push the character c back so that the next call to input()
 *         returns it again.  Only one character can be pending at a time.
 *         Pushing back a '\n' decrements yylineno, undoing the increment
 *         input() made when it first returned that newline.
 */
static void
unput(int c)
{
#ifdef DEBUG
    /* A second pushback would silently discard the first one. */
    if (pushback != -1) {
        printf("Attempt to push back 2 characters at one time!\n");
        exit(1);
    }
#endif

    if (c == '\n')
        yylineno--;
    pushback = c;
}
/****************************************************************************/
/* */
/* Initialization routines: */
/* */
/****************************************************************************/
/*
 * keyword_info - one row of the keyword table: a keyword's spelling paired
 *                with the token number that lex_open() stores for it in
 *                keyword_dict.
 */
struct keyword_info {
    string keyword;             /* the keyword's name */
    int keyword_number;         /* the token number it maps to */
};
/*
 * keywords - the master list of keyword name / token number pairs.
 *            lex_open() copies every entry into keyword_dict the first
 *            time it is called.  Note that "and", "or" and "not" map to
 *            the single-character tokens '&', '|' and '!'.
 */
static struct keyword_info keywords[] = {
    { "and",         '&' },
    { "appendport",  APPENDPORT },
    { "buffer",      BUFFER },
    { "break",       BREAK },
    { "closeinput",  CLOSEINPUT },
    { "closeoutput", CLOSEOUTPUT },
    { "closeport",   CLOSEPORT },
    { "case",        CASE },
    { "clearbuf",    CLEARBUF },
    { "default",     DEFAULT },
    { "do",          DO },
    { "downcase",    DOWNCASE },
    { "else",        ELSE },
    { "elseif",      ELSEIF },
    { "endcase",     ENDCASE },
    { "endif",       ENDIF },
    { "endwhile",    ENDWHILE },
    { "exec",        EXEC },
    { "execport",    EXECPORT },
    { "exit",        EXIT },
    { "fields",      FIELDS },
    { "get",         GET },
    { "getenv",      GETENV },
    { "if",          IF },
    { "inputport",   INPUTPORT },
    { "lany",        LANY },
    { "lbreak",      LBREAK },
    { "lspan",       LSPAN },
    { "match",       MATCH },
    { "noop",        NOOP },
    { "not",         '!' },
    { "or",          '|' },
    { "outputport",  OUTPUTPORT },
    { "print",       PRINT },
    { "protect",     PROTECT },
    { "put",         PUT },
    { "rany",        RANY },
    { "rbreak",      RBREAK },
    { "rspan",       RSPAN },
    { "set",         SET },
    { "show",        SHOW },
    { "stylestrip",  STYLESTRIP },
    { "substitute",  SUBSTITUTE },
    { "then",        THEN },
    { "upcase",      UPCASE },
    { "while",       WHILE },
    { "verbatim",    VERBATIM },
    { "zvar",        ZVAR }
};
/*
* lex_open - this routine [re]initializes the lexer & prepares it to lex
* a file. Resets current line # to 1.
*/
void
lex_open(FILE *file)
{
/*
* Initialize I/O:
*/
input_file = file;
yylineno = 1;
pushback = -1;
/*
* Initialize keyword_dict from keywords if needed:
*/
if (!keyword_dict) {
unsigned int i;
keyword_dict = int_dictionary_Create(101);
for (i=0; i<sizeof(keywords)/sizeof(struct keyword_info); i++)
int_dictionary_Define(keyword_dict, keywords[i].keyword,
0)->value = keywords[i].keyword_number;
}
}
/****************************************************************************/
/* */
/* lex subroutines: */
/* */
/****************************************************************************/
/*
 * eat_escape_code - this routine eats an escape code & returns the character
 *                   it codes for, or 0 if it codes for "".  (An escape
 *                   code is what follows a '\\' in a quoted string.)
 *                   Current escape codes are:
 *
 *                       "n"          == '\n'
 *                       "t"          == '\t'
 *                       "b"          == '\b'
 *                       "\n"         == ""  (i.e., returns 0)
 *                       <EOF>        == ""  (the EOF is pushed back)
 *                       [0-7]{1,3}   == the character represented by the
 *                                       code interpreted as an octal number.
 *                       [^ntb0-7\n]  == the same character.  I.e., "*" == '*'
 */

#define is_octal_digit(c) (((c)>='0') && ((c)<='7'))

static char
eat_escape_code(void)
{
    int ch, value, digits;

    ch = input();

    /*
     * Octal escape: one digit is already in hand; take up to two more,
     * pushing back the first character that is not an octal digit.
     */
    if (is_octal_digit(ch)) {
        value = ch - '0';
        for (digits = 1; digits < 3; digits++) {
            ch = input();
            if (!is_octal_digit(ch)) {
                unput(ch);
                break;
            }
            value = value*8 + (ch - '0');
        }
        return(value);
    }

    switch (ch) {
      case 0:                   /* i.e., EOF: leave it for the caller */
        unput(ch);
        return(0);

      case '\n':                /* backslash-newline codes for nothing */
        return(0);

      case 'n':
        return('\n');

      case 't':
        return('\t');

      case 'b':
        return('\b');

      default:                  /* anything else stands for itself */
        return(ch);
    }
}
/*
 * eat_string - this routine eats characters, allowing escape codes via '\\',
 *              until a '"' is eaten.  If a '\n' or the <EOF> is seen before
 *              the closing '"', a parse error is reported & 0 is returned
 *              (the offending '\n' or <EOF> is pushed back).  Otherwise, the
 *              string represented by what has been eaten is returned.
 *              I.e., 'hello \n there"' would cause "hello \n there" to be
 *              returned.  (That's not a <cr> in the first case, but it is a
 *              <cr> in the second.)  The returned string is on the heap &
 *              must be freed eventually.  This routine should be passed the
 *              line # that the string we are eating started on.
 */
static char *
eat_string(int starting_line)
{
    char chunk[500];
    char *fill = chunk;                 /* next free slot in chunk */
    char * const soft_limit = chunk + sizeof(chunk) - 20;
    string tail, whole;
    int ch;

    /*
     * Collect characters into chunk until it is nearly full:
     */
    while (fill <= soft_limit) {
        ch = input();

        switch (ch) {
          case 0:               /* i.e., EOF */
            unput(ch);
            report_parse_error("unterminated string found beginning",
                               starting_line);
            return(0);

          case '"':             /* closing quote: the string is complete */
            *fill = 0;
            return(string_Copy(chunk));

          case '\n':
            unput(ch);          /* fix line # reference to right line # */
            report_parse_error("carriage return found in string", yylineno);
            return(0);

          case '\\':
            ch = eat_escape_code();
            if (!ch)            /* the escape coded for "" */
                continue;
            break;              /* escaped characters are always literal */

          default:
            break;
        }

        *fill++ = ch;
    }

    /*
     * Out of buffer space: eat the remainder of the string with a
     * recursive call, then concatenate it onto what we have so far:
     */
    tail = eat_string(starting_line);
    if (!tail)
        return(0);
    *fill = 0;
    whole = string_Concat(chunk, tail);
    free(tail);
    return(whole);
}
/*
 * eat_show_line - internal routine for eat_til_endshow:
 *
 *        This routine reads in a physical line of text allowing escape
 *    codes via '\\'.  If the line ends with a newline, the newline is eaten
 *    (and is part of the result).  A backslash-newline also ends the line
 *    but contributes nothing to the result.  If the line ends with an EOF,
 *    the EOF is not eaten.  The string represented by what has been eaten
 *    is returned.  The returned string is on the heap & must be freed
 *    eventually.  If test_for_endshow is true and the line read in starts
 *    off with "endshow" exactly (i.e., no escape codes) followed by any
 *    non-identifier-char, then instead of doing the above, we just eat the
 *    "endshow" & return 0.
 *
 *        Lines too long for the local buffer are read in pieces by
 *    recursive calls that do not test for endshow.
 */
static char *
eat_show_line(int test_for_endshow)
{
    int c;
    int saw_escape_code = 0;
    int starting_line = yylineno;
    char buffer[200];      /* This must be large enough to hold "endshow" */
    char *ptr = buffer;

    /*
     * input() bumps yylineno when it returns a newline, so the line # is
     * what tells us that the end of this physical line has been eaten:
     */
    while (yylineno == starting_line) {
        c = input();
        if (!c) {
            unput(c);           /* leave the EOF for our caller to see */
            break;
        }
        if (c == '\\') {
            saw_escape_code = 1;
            c = eat_escape_code();
            if (!c)
                continue;
        }

        *ptr = c;
        ptr++;

        /*
         * Once exactly strlen("endshow") characters are in, see whether
         * this is the terminator line.  Peek at (but do not eat) the
         * character that follows to make sure the word ends there:
         */
        if (test_for_endshow && !saw_escape_code
            && (ptr == buffer+strlen("endshow"))
            && !strncmp(buffer, "endshow", strlen("endshow"))) {
            c = input();
            unput(c);
            if (!is_identifier_char(c))
                return(0);
        }

        /*
         * Out of buffer space: read the rest of this line recursively.
         * This must only happen while we are still on the starting line.
         * If the character that filled the buffer was the line's own
         * newline, the line is already complete, and a recursive call
         * would swallow the whole NEXT line -- bypassing our caller's
         * leading-whitespace stripping and endshow test for it.
         */
        if ((ptr > buffer+sizeof(buffer)-2) && (yylineno == starting_line)) {
            string the_line;
            string rest_of_line = eat_show_line(0);

            *ptr = '\0';
            the_line = string_Concat(buffer, rest_of_line);
            free(rest_of_line);
            return(the_line);
        }
    }

    /* ptr is at most buffer+sizeof(buffer)-1 here, so this is in bounds. */
    *ptr = '\0';
    return(string_Copy(buffer));
}
/*
 * eat_til_endshow - this routine eats characters, allowing escape codes via
 *                   '\\', up to an endshow\{nonalpha} found at the start of
 *                   a line, not counting leading whitespace.  If <EOF> is
 *                   seen before the terminator, a parse error is reported
 *                   & 0 is returned.  Otherwise, the string represented by
 *                   what has been eaten (escape codes replaced by what they
 *                   stand for, and leading spaces and tabs removed from
 *                   each physical line) is returned.  The returned string
 *                   is on the heap & must be freed eventually.  Note that
 *                   to embed endshow in a message, endsho\w can be used.
 *                   This routine should be passed the line # of the show
 *                   command it is being used to process, for use in error
 *                   messages.
 */
static char *
eat_til_endshow(int start_line_no)
{
    string collected = string_Copy("");
    string line;
    int ch;

    for (;;) {
        /*
         * Drop the spaces & tabs that start the current line, then put
         * back the first character that is neither:
         */
        do {
            ch = input();
        } while (ch == ' ' || ch == '\t');
        unput(ch);

        /*
         * Hitting <EOF> here means the show was never terminated:
         */
        if (!ch) {
            report_parse_error("unterminated show beginning", start_line_no);
            free(collected);
            return(0);
        }

        /*
         * Read in the rest of the line (including the <cr> at its end),
         * allowing for escape codes and checking for "endshow{nonalpha}"
         * at the start of the line.  (Note: \<newline> is considered the
         * end of a line here!)
         */
        line = eat_show_line(1);
        if (!line)              /* i.e., that was the endshow line */
            return(collected);

        collected = string_Concat2(collected, line);
        free(line);
    }
}
/*
* handle_show - this routine is called after "show"\{nonalpha} is
* found to handle up to the endshow. The token # is
* returned.
*/
static int
handle_show(void)
{
int c;
int start_line_no = yylineno;
/*
* Eat up ' ' and '\t's after show. If the next character is a newline,
* eat it. This is so we don't get an extra newline when we call
* eat_til_endshow:
*/
while (c=input(), c==' ' || c=='\t') ;
if (c!='\n')
unput(c);
yylval.text = eat_til_endshow(start_line_no);
if (yylval.text)
return(SHOW);
else
return(ERROR);
}
/****************************************************************************/
/* */
/* The main lexer itself: */
/* */
/****************************************************************************/
/*
 * scan_identifier_into - finish reading an identifier whose first character,
 *                        c, has already been eaten.  The first
 *                        MAX_IDENTIFIER_LENGTH characters of the identifier
 *                        are stored null terminated in varname (which must
 *                        have room for MAX_IDENTIFIER_LENGTH+1 chars); the
 *                        rest of the identifier's characters are eaten and
 *                        dropped.  The character that ended the identifier
 *                        is pushed back.
 */
static void
scan_identifier_into(int c, char *varname)
{
    char *ptr = varname;

    do {
        if (ptr < varname+MAX_IDENTIFIER_LENGTH)
            *(ptr++) = c;
        c = input();
    } while (is_identifier_char(c));

    unput(c);
    *ptr = '\0';
}

/*
 * yylex - performs as per the yacc manual's requirements: each call returns
 *         the token number of the next token in the file given to
 *         lex_open(), with 0 being returned at <EOF>.  For VARNAME, VARREF,
 *         STRING and SHOW tokens, yylval.text is set to a heap-allocated
 *         copy of the token's text.  ERROR is returned for anything that
 *         does not lex.
 */
int yylex(void)
{
    int c, last_char;
    int start_line_no;
    int_dictionary_binding *binding;
    char varname[MAX_IDENTIFIER_LENGTH+1];

    for (;;) {
        switch (c = input()) {

            /*
             * Skip whitespace:
             */
          case ' ':  case '\t':  case '\n':
            continue;

            /*
             * '#' comments out everything up to and including
             * the next <cr>:
             */
          case '#':
            while ( (c=input()) && (c!='\n') ) ;
            if (!c)
                unput(c);       /* let the next pass see the <EOF> */
            continue;

            /*
             * Handle c-style comments.  Note that "/[^*]" is not the start
             * of any valid token.
             */
          case '/':
            start_line_no = yylineno;

            /* verify that next character is a '*': */
            if ((c=input()) != '*')
                return(ERROR);

            /* Scan until "*\/" or <EOF>: */
            for (last_char=0; ; last_char=c) {
                c = input();
                if (c == '/' && (last_char=='*'))
                    break;
                if (!c) {
                    unput(c);
                    report_parse_error("unterminated c style comment found beginning", start_line_no);
                    return(ERROR);
                }
            }
            continue;

            /*
             * The following characters lex as themselves:
             *   '+', '|', '&', '(', ')', '.', ',' and <EOF>:
             */
          case   0: case '+': case '|': case '&': case '(':
          case ')': case '.': case ',':
            return(c);

            /*
             * Handle "=[^~=]", "=~", and "==":
             */
          case '=':
            switch (c = input()) {
              case '~':
                return(REGEQ);
              case '=':
                return(EQ);
              default:
                unput(c);
                return('=');
            }

            /*
             * Handle "![^~=]", "!~", and "!=":
             */
          case '!':
            switch (c = input()) {
              case '~':
                return(REGNEQ);
              case '=':
                return(NEQ);
              default:
                unput(c);
                return('!');
            }

            /*
             * Handle identifiers and keywords:
             *
             * Note that the below set of characters is hard coded from
             * is_identifier_char from parser.h.
             */
          case 'a': case 'b': case 'c': case 'd': case 'e':
          case 'f': case 'g': case 'h': case 'i': case 'j':
          case 'k': case 'l': case 'm': case 'n': case 'o':
          case 'p': case 'q': case 'r': case 's': case 't':
          case 'u': case 'v': case 'w': case 'x': case 'y':
          case 'z':
          case 'A': case 'B': case 'C': case 'D': case 'E':
          case 'F': case 'G': case 'H': case 'I': case 'J':
          case 'K': case 'L': case 'M': case 'N': case 'O':
          case 'P': case 'Q': case 'R': case 'S': case 'T':
          case 'U': case 'V': case 'W': case 'X': case 'Y':
          case 'Z':
          case '0': case '1': case '2': case '3': case '4':
          case '5': case '6': case '7': case '8': case '9':
          case '_':
            scan_identifier_into(c, varname);

            /*
             * Look up the identifier in the keyword dictionary.
             * If it's a match, return the keyword's #.  In the case
             * of show, call handle_show to do more processing.
             * If not a match, treat as a variable name.
             */
            binding = int_dictionary_Lookup(keyword_dict, varname);
            if (!binding) {
                yylval.text = string_Copy(varname);
                return(VARNAME);
            }
            if (binding->value == SHOW)
                return(handle_show());
            else
                return(binding->value);

            /*
             * Handle "${identifier}".  Note that $ followed by a
             * non-identifier character is not the start of any valid token.
             */
          case '$':
            c = input();
            if (!is_identifier_char(c))
                return(ERROR);

            scan_identifier_into(c, varname);
            yylval.text = string_Copy(varname);
            return(VARREF);

            /*
             * Handle constant strings:
             */
          case '"':
            yylval.text = eat_string(yylineno);
            if (yylval.text)
                return(STRING);
            else
                return(ERROR);

            /*
             * All other characters do not start valid tokens:
             */
          default:
            return(ERROR);
        }
    }
}