Section 366: Expanding the next token

Only a dozen or so command codes > MAX_COMMAND can possibly be returned by get_next; in increasing order, they are UNDEFINED_CS, EXPAND_AFTER, NO_EXPAND, INPUT, IF_TEST, FI_OR_ELSE, CS_NAME, CONVERT, THE, TOP_BOT_MARK, CALL, LONG_CALL, OUTER_CALL, LONG_OUTER_CALL, and END_TEMPLATE.

The expand subroutine is used when cur_cmd MAX_COMMAND. It removes a “call” or a conditional or one of the other special operations just listed. It follows that expand might invoke itself recursively. In all cases, expand destroys the current token, but it sets things up so that the next get_next will deliver the appropriate next token. The value of cur_tok need not be known when expand is called.

Since several of the basic scanning routines communicate via global variables, their values are saved as local variables of expand so that recursive calls don’t invalidate them.

expand_next_token.c
// << Start file |expand_next_token.c|, 1382 >>

void expand() {
    halfword t;      // token that is being ``expanded after''
    pointer p, q, r; // for list manipulation
    int j;           // index into |buffer|
    int cv_backup;   // to save the global quantity |cur_val|
    small_number cvl_backup, radix_backup, co_backup; // to save |cur_val_level|, etc.
    pointer backup_backup; // to save |link(BACKUP_HEAD)|
    small_number save_scanner_status; // temporary storage of |scanner_status|
    
    cv_backup = cur_val;
    cvl_backup = cur_val_level;
    radix_backup = radix;
    co_backup = cur_order;
    backup_backup = link(BACKUP_HEAD);
    if (cur_cmd < CALL) {
        // << Expand a nonmacro, 367 >>
    }
    else if (cur_cmd < END_TEMPLATE) {
        macro_call();
    }
    else {
        // << Insert a token containing |FROZEN_ENDV|, 375 >>
    }
    cur_val = cv_backup;
    cur_val_level = cvl_backup;
    radix = radix_backup;
    cur_order = co_backup;
    link(BACKUP_HEAD) = backup_backup;
}

Section 367

⟨ Expand a nonmacro 367 ⟩≡

if (tracing_commands > 1) {
    show_cur_cmd_chr();
}
switch (cur_cmd) {
case TOP_BOT_MARK:
    // << Insert the appropriate mark text into the scanner, 386 >>
    break;

case EXPAND_AFTER:
    // << Expand the token after the next token, 368 >>
    break;

case NO_EXPAND:
    // << Suppress expansion of the next token, 369 >>
    break;

case CS_NAME:
    // << Manufacture a control sequence name, 372 >>
    break;

case CONVERT:
    conv_toks();
    break; // this procedure is discussed in Part 27 below

case THE:
    ins_the_toks();
    break; // this procedure is discussed in Part 27 below

case IF_TEST:
    conditional();
    break; // this procedure is discussed in Part 28 below

case FI_OR_ELSE:
    // << Terminate the current conditional and skip to \fi, 510 >>
    break;

case INPUT:
    // << Initiate or terminate input from a file, 378 >>
    break;

default:
    // << Complain about an undefined macro, 370 >>
}

Section 368

It takes only a little shuffling to do what calls \expandafter.

⟨ Expand the token after the next token 368 ⟩≡

get_token();
t = cur_tok;
get_token();
if (cur_cmd > MAX_COMMAND) {
    expand();
}
else {
    back_input();
}
cur_tok = t;
back_input();

Section 369

The implementation of \noexpand is a bit trickier, because it is necessary to insert a special ‘DONT_EXPAND’ marker into ’s reading mechanism. This special marker is processed by get_next, but it does not slow down the inner loop.

Since \outer macros might arise here, we must also clear the scanner_status temporarily.

⟨ Suppress expansion of the next token 369 ⟩≡

save_scanner_status = scanner_status;
scanner_status = NORMAL;
get_token();
scanner_status = save_scanner_status;
t = cur_tok;
back_input(); // now |start| and |loc| point to the backed-up token |t|
if (t >= CS_TOKEN_FLAG) {
    p = get_avail();
    info(p) = CS_TOKEN_FLAG + FROZEN_DONT_EXPAND;
    link(p) = loc;
    start = p;
    loc = p;
}

Section 370

⟨ Complain about an undefined macro 370 ⟩≡

print_err("Undefined control sequence");
help5("The control sequence at the end of the top line")
    ("of your error message was never \\def'ed. If you have")
    ("misspelled it (e.g., `\\hobx'), type `I' and the correct")
    ("spelling (e.g., `I\\hbox'). Otherwise just continue,")
    ("and I'll forget about whatever was undefined.");
error();

Section 371

The expand procedure and some other routines that construct token lists find it convenient to use the following macros, which are valid only if the variables p and q are reserved for token-list building.

parser.h
#define store_new_token(X) \
    q = get_avail();       \
    link(p) = q;           \
    info(q) = (X);         \
    p = q // |link(p)| is |null|

#define fast_store_new_token(X) \
    fast_get_avail(q);          \
    link(p) = q;                \
    info(q) = (X);              \
    p = q // |link(p)| is |null|

Section 372

⟨ Manufacture a control sequence name 372 ⟩≡

r = get_avail();
p = r; // head of the list of characters
do {
    get_x_token();
    if (cur_cs == 0) {
        store_new_token(cur_tok);
    }
} while (cur_cs == 0);
if (cur_cmd != END_CS_NAME) {
    // << Complain about missing \endcsname, 373 >>
}
// << Look up the characters of list |r| in the hash table, and set |cur_cs|, 374 >>
flush_list(r);
if (eq_type(cur_cs) == UNDEFINED_CS) {
    eq_define(cur_cs, RELAX, 256); // N.B.: The |save_stack| might change
} // the control sequence will now match '\relax'
cur_tok = cur_cs + CS_TOKEN_FLAG;
back_input();

Section 373

⟨ Complain about missing \endcsname 373 ⟩≡

print_err("Missing ");
print_esc("endcsname");
print(" inserted");
help2("The control sequence marked <to be read again> should")
    ("not appear between \\csname and \\endcsname.");
back_error();

Section 374

⟨ Look up the characters of list r in the hash table, and set cur_cs 374 ⟩≡

j = first;
p = link(r);
while (p != null) {
    if (j >= max_buf_stack) {
        max_buf_stack = j + 1;
        if (max_buf_stack == BUF_SIZE) {
            overflow("buffer size", BUF_SIZE);
        }
    }
    buffer[j] = info(p) % 256;
    incr(j);
    p = link(p);
}
if (j > first + 1) {
    no_new_control_sequence = false;
    cur_cs = id_lookup(first, j - first);
    no_new_control_sequence = true;
}
else if (j == first) {
    cur_cs = NULL_CS; // the list is empty
}
else {
    cur_cs = SINGLE_BASE + buffer[first]; // the list has length one
}

Section 375

An END_TEMPLATE command is effectively changed to an ENDV command by the following code. (The reason for this is discussed below; the FROZEN_END_TEMPLATE at the end of the template has passed the check_outer_validity test, so its mission of error detection has been accomplished.)

⟨ Insert a token containing FROZEN_ENDV 375 ⟩≡

cur_tok = CS_TOKEN_FLAG + FROZEN_ENDV;
back_input();

Section 376

The processing of \input involves the start_input subroutine, which will be declared later; the processing of \endinput is trivial.

⟨ Put each of TeX’s primitives into the hash table 226 ⟩+≡

primitive("input", INPUT, 0);
primitive("endinput", INPUT, 1);

Section 377

⟨ Cases of print_cmd_chr for symbolic printing of primitives 227 ⟩+≡

case INPUT:
    if (chr_code == 0) {
        print_esc("input");
    }
    else {
        print_esc("endinput");
    }
    break;

Section 378

⟨ Initiate or terminate input from a file 378 ⟩≡

if (cur_chr > 0) {
    force_eof = true;
}
else if (name_in_progress) {
    insert_relax();
}
else {
    start_input();
}

Section 379

Sometimes the expansion looks too far ahead, so we want to insert a harmless \relax into the user’s input.

expand_next_token.c
void insert_relax() {
    cur_tok = CS_TOKEN_FLAG + cur_cs;
    back_input();
    cur_tok = CS_TOKEN_FLAG + FROZEN_RELAX;
    back_input();
    token_type = INSERTED;
}

Section 380

Here is a recursive procedure that is ’s usual way to get the next token of input. It has been slightly optimized to take account of common cases.

expand_next_token.c
// sets |cur_cmd|, |cur_chr|, |cur_tok|, and expands macros
void get_x_token() {
    // restart:
    while(true) {
        get_next();
        if (cur_cmd <= MAX_COMMAND) {
            break; // Goto done
        }
        if (cur_cmd >= CALL) {
            if (cur_cmd < END_TEMPLATE) {
                macro_call();
            }
            else {
                cur_cs = FROZEN_ENDV;
                cur_cmd = ENDV;
                break; // Goto done; |cur_chr = NULL_LIST|
            }
        }
        else {
            expand();
        }
        // Goto restart
    }
    // done:
    if (cur_cs == 0) {
        cur_tok = (cur_cmd * 256) + cur_chr;
    }
    else {
        cur_tok = CS_TOKEN_FLAG + cur_cs;
    }
}

Section 381

The get_x_token procedure is essentially equivalent to two consecutive procedure calls: get_next, x_token.

expand_next_token.c
// |get_x_token| without the initial |get_next|
void x_token() {
    while (cur_cmd > MAX_COMMAND) {
        expand();
        get_next();
    }
    if (cur_cs == 0) {
        cur_tok = (cur_cmd * 256) + cur_chr;
    }
    else {
        cur_tok = CS_TOKEN_FLAG + cur_cs;
    }
}

Section 382

A control sequence that has been \def’ed by the user is expanded by ’s macro_call procedure.

Before we get into the details of macro_call, however, let’s consider the treatment of primitives like \topmark, since they are essentially macros without parameters. The token lists for such marks are kept in a global array of five pointers; we refer to the individual entries of this array by symbolic names top_mark, etc. The value of top_mark is either null or a pointer to the reference count of a token list.

constants.h
#define TOP_MARK_CODE         0 // the mark in effect at the previous page break
#define FIRST_MARK_CODE       1 // the first mark between |top_mark| and |bot_mark|
#define BOT_MARK_CODE         2 // the mark in effect at the current page break
#define SPLIT_FIRST_MARK_CODE 3 // the first mark found by \vsplit
#define SPLIT_BOT_MARK_CODE   4 // the last mark found by \vsplit
parser.h
#define top_mark         cur_mark[TOP_MARK_CODE]
#define first_mark       cur_mark[FIRST_MARK_CODE]
#define bot_mark         cur_mark[BOT_MARK_CODE]
#define split_first_mark cur_mark[SPLIT_FIRST_MARK_CODE]
#define split_bot_mark   cur_mark[SPLIT_BOT_MARK_CODE]

⟨ Global variables 13 ⟩+≡

pointer cur_mark[SPLIT_BOT_MARK_CODE + 1]; // token lists for marks

Section 383

⟨ Set initial values of key variables 21 ⟩+≡

top_mark = null;
first_mark = null;
bot_mark = null;
split_first_mark = null;
split_bot_mark = null;

Section 384

⟨ Put each of TeX’s primitives into the hash table 226 ⟩+≡

primitive("topmark", TOP_BOT_MARK, TOP_MARK_CODE);
primitive("firstmark", TOP_BOT_MARK, FIRST_MARK_CODE);
primitive("botmark", TOP_BOT_MARK, BOT_MARK_CODE);
primitive("splitfirstmark", TOP_BOT_MARK, SPLIT_FIRST_MARK_CODE);
primitive("splitbotmark", TOP_BOT_MARK, SPLIT_BOT_MARK_CODE);

Section 385

⟨ Cases of print_cmd_chr for symbolic printing of primitives 227 ⟩+≡

case TOP_BOT_MARK:
    switch (chr_code) {
    case FIRST_MARK_CODE:
        print_esc("firstmark");
        break;
    
    case BOT_MARK_CODE:
        print_esc("botmark");
        break;
    
    case SPLIT_FIRST_MARK_CODE:
        print_esc("splitfirstmark");
        break;
    
    case SPLIT_BOT_MARK_CODE:
        print_esc("splitbotmark");
        break;
    
    default:
        print_esc("topmark");
    } 
    break;

Section 386

The following code is activated when cur_cmd = TOP_BOT_MARK and when cur_chr is a code like TOP_MARK_CODE.

⟨ Insert the appropriate mark text into the scanner 386 ⟩≡

if (cur_mark[cur_chr] != null) {
    begin_token_list(cur_mark[cur_chr], MARK_TEXT);
}

Section 387

Now let’s consider MACRO_CALL itself, which is invoked when is scanning a control sequence whose cur_cmd is either CALL, LONG_CALL, OUTER_CALL, or LONG_OUTER_CALL. The control sequence definition appears in the token list whose reference count is in location cur_chr of mem.

The global variable long_state will be set to CALL or to LONG_CALL, depending on whether or not the control sequence disallows \par in its parameters. The get_next routine will set long_state to OUTER_CALL and emit \par, if a file ends or if an \outer control sequence occurs in the midst of an argument.

⟨ Global variables 13 ⟩+≡

int long_state; // governs the acceptance of \par

Section 388

The parameters, if any, must be scanned before the macro is expanded. Parameters are token lists without reference counts. They are placed on an auxiliary stack called pstack while they are being scanned, since the param_stack may be losing entries during the matching process. (Note that param_stack can’t be gaining entries, since macro_call is the only routine that puts anything onto param_stack, and it is not recursive.)

⟨ Global variables 13 ⟩+≡

pointer pstack[9]; // arguments supplied to a macro

Section 389

After parameter scanning is complete, the parameters are moved to the param_stack. Then the macro body is fed to the scanner; in other words, macro_call places the defined text of the control sequence at the top of ’s input stack, so that get_next will proceed to read it next.

The global variable cur_cs contains the eqtb address of the control sequence being expanded, when macro_call begins. If this control sequence has not been declared \long, i.e., if its command code in the eq_type field is not LONG_CALL or LONG_OUTER_CALL, its parameters are not allowed to contain the control sequence \par. If an illegal \par appears, the macro call is aborted, and the \par will be rescanned.

expand_next_token.c
// invokes a user-defined control sequence
void macro_call() {
    pointer r;                 // current node in the macro's token list
    pointer p = null;          // current node in parameter token list being built
    pointer q;                 // new node being put into the token list
    pointer s;                 // backup pointer for parameter matching
    pointer t;                 // cycle pointer for backup recovery
    pointer u, v;              // auxiliary pointers for backup recovery
    pointer rbrace_ptr = null; // one step before the last |RIGHT_BRACE| token
    small_number n;            // the number of parameters scanned
    halfword unbalance;        // unmatched left braces in current parameter
    halfword m = 0;            // the number of tokens or groups (usually)
    pointer ref_count;         // start of the token list
    small_number save_scanner_status; // |scanner_status| upon entry
    pointer save_warning_index;       // |warning_index| upon entry
    ASCII_code match_chr;             // character used in parameter
    
    save_scanner_status = scanner_status;
    save_warning_index = warning_index;
    warning_index = cur_cs;
    ref_count = cur_chr;
    r = link(ref_count);
    n = 0;
    if (tracing_macros > 0) {
        // << Show the text of the macro being expanded, 401 >>
    }
    if (info(r) != END_MATCH_TOKEN) {
        // << Scan the parameters and make |link(r)| point to the macro body; but |return| if an illegal \par is detected, 391 >>
    }
    // << Feed the macro body and its parameters to the scanner, 390 >>
end:
    scanner_status = save_scanner_status;
    warning_index = save_warning_index;
}

Section 390

Before we put a new token list on the input stack, it is wise to clean off all token lists that have recently been depleted. Then a user macro that ends with a call to itself will not require unbounded stack space.

⟨ Feed the macro body and its parameters to the scanner 390 ⟩≡

while (state == TOKEN_LIST
    && loc == null
    && token_type != V_TEMPLATE)
{
    end_token_list(); // conserve stack space
}
begin_token_list(ref_count, MACRO);
name = warning_index;
loc = link(r);
if (n > 0) {
    if (param_ptr + n > max_param_stack) {
        max_param_stack = param_ptr + n;
        if (max_param_stack > PARAM_SIZE) {
            overflow("parameter stack size", PARAM_SIZE);
        }
    }
    for(m = 0; m < n; m++) {
        param_stack[param_ptr + m] = pstack[m];
    }
    param_ptr += n;
}

Section 391

At this point, the reader will find it advisable to review the explanation of token list format that was presented earlier, since many aspects of that format are of importance chiefly in the macro_call routine.

The token list might begin with a string of compulsory tokens before the first MATCH or END_MATCH. In that case the macro name is supposed to be followed by those tokens; the following program will set s = null to represent this restriction. Otherwise s will be set to the first token of a string that will delimit the next parameter.

⟨ Scan the parameters and make link(r) point to the macro body; but return if an illegal \par is detected 391 ⟩≡

scanner_status = MATCHING;
unbalance = 0;
long_state = eq_type(cur_cs);
if (long_state >= OUTER_CALL) {
    long_state -= 2;
}
do {
    link(TEMP_HEAD) = null;
    if (info(r) > MATCH_TOKEN + 255 || info(r) < MATCH_TOKEN) {
        s = null;
    }
    else {
        match_chr = info(r) - MATCH_TOKEN;
        s = link(r);
        r = s;
        p = TEMP_HEAD;
        m = 0;
    }
    // << Scan a parameter until its delimiter string has been found; or, if |s = null|, simply scan the delimiter string, 392 >>
    // now |info(r)| is a token whose command code is either |match| or |end_match|
} while (info(r) != END_MATCH_TOKEN);

Section 392

If info(r) is a MATCH or END_MATCH command, it cannot be equal to any token found by get_token. Therefore an undelimited parameter—i.e., a MATCH that is immediately followed by MATCH or END_MATCH—will always fail the test ‘cur_tok = info(r)’ in the following algorithm.

⟨ Scan a parameter until its delimiter string has been found; or, if s = null, simply scan the delimiter string 392 ⟩≡

continue_lbl:
get_token(); // set |cur_tok| to the next token of input
if (cur_tok == info(r)) {
    // << Advance |r|; |goto found| if the parameter delimiter has been fully matched, otherwise |goto continue_lbl|, 394 >>
}
// << Contribute the recently matched tokens to the current parameter, and |goto continue| if a partial match is still in effect; but abort if |s = null|, 397 >>
if (cur_tok == par_token && long_state != LONG_CALL) {
    // << Report a runaway argument and abort, 396 >>
}
if (cur_tok < RIGHT_BRACE_LIMIT) {
    if (cur_tok < LEFT_BRACE_LIMIT) {
        // << Contribute an entire group to the current parameter, 399 >>
    }
    else {
        // << Report an extra right brace and |goto continue|, 395 >>
    }
}
else {
    // << Store the current token, but |goto continue_lbl| if it is a blank space that would become an undelimited parameter, 393 >>
}
incr(m);
if (info(r) > END_MATCH_TOKEN || info(r) < MATCH_TOKEN) {
    goto continue_lbl;
}
found:
if (s != null) {
    // << Tidy up the parameter just scanned, and tuck it away, 400 >>
}

Section 393

⟨ Store the current token, but goto continue_lbl if it is a blank space that would become an undelimited parameter 393 ⟩≡

if (cur_tok == SPACE_TOKEN
    && info(r) <= END_MATCH_TOKEN
    && info(r) >= MATCH_TOKEN)
{
    goto continue_lbl;
}
store_new_token(cur_tok);

Section 394

A slightly subtle point arises here: When the parameter delimiter ends with ‘#{’, the token list will have a left brace both before and after the END_MATCH. Only one of these should affect the align_state, but both will be scanned, so we must make a correction.

⟨ Advance r; goto found if the parameter delimiter has been fully matched, otherwise goto continue_lbl 394 ⟩≡

r = link(r);
if (info(r) >= MATCH_TOKEN && info(r) <= END_MATCH_TOKEN) {
    if (cur_tok < LEFT_BRACE_LIMIT) {
        decr(align_state);
    }
    goto found;
}
else {
    goto continue_lbl;
}

Section 395

⟨ Report an extra right brace and goto continue 395 ⟩≡

back_input();
print_err("Argument of ");
sprint_cs(warning_index);
print(" has an extra }");
help6("I've run across a `}' that doesn't seem to match anything.")
    ("For example, `\\def\\a#1{...}' and `\\a}' would produce")
    ("this error. If you simply proceed now, the `\\par' that")
    ("I've just inserted will cause me to report a runaway")
    ("argument that might be the root of the problem. But if")
    ("your `}' was spurious, just type `2' and it will go away.");
incr(align_state);
long_state = CALL;
cur_tok = par_token;
ins_error();
goto continue_lbl;
// a white lie; the \par won't always trigger a runaway

Section 396

If long_state = OUTER_CALL, a runaway argument has already been reported.

⟨ Report a runaway argument and abort 396 ⟩≡

if (long_state == CALL) {
    runaway();
    print_err("Paragraph ended before ");
    sprint_cs(warning_index);
    print(" was complete");
    help3("I suspect you've forgotten a `}', causing me to apply this")
        ("control sequence to too much text. How can we recover?")
        ("My plan is to forget the whole thing and hope for the best.");
    back_error();
}
pstack[n] = link(TEMP_HEAD);
align_state -= unbalance;
for(m = 0; m <= n; m++) {
    flush_list(pstack[m]);
}
goto end;

Section 397

When the following code becomes active, we have matched tokens from s to the predecessor of r, and we have found that cur_tok info(r). An interesting situation now presents itself: If the parameter is to be delimited by a string such as ‘ab’, and if we have scanned ‘aa’, we want to contribute one ‘a’ to the current parameter and resume looking for a ‘b’. The program must account for such partial matches and for others that can be quite complex. But most of the time we have s = r and nothing needs to be done.

Incidentally, it is possible for \par tokens to sneak in to certain parameters of non-\long macros. For example, consider a case like ‘\def\a#1\par!{...}’ where the first \par is not followed by an exclamation point. In such situations it does not seem appropriate to prohibit the \par, so keeps quiet about this bending of the rules.

⟨ Contribute the recently matched tokens to the current parameter, and goto continue if a partial match is still in effect; but abort if s = null 397 ⟩≡

if (s != r) {
    if (s == null) {
        // << Report an improper use of the macro and abort, 398 >>
    }
    else {
        t = s;
        do {
            store_new_token(info(t));
            incr(m);
            u = link(t);
            v = s;
            while(true) {
                if (u == r) {
                    if (cur_tok != info(v)) {
                        goto done;
                    }
                    else {
                        r = link(v);
                        goto continue_lbl;
                    }
                }
                if (info(u) != info(v)) {
                    goto done;
                }
                u = link(u);
                v = link(v);
            }
done:
            t = link(t);
        } while (t != r);
        r = s; // at this point, no tokens are recently matched
    }
}

Section 398

⟨ Report an improper use of the macro and abort 398 ⟩≡

print_err("Use of ");
sprint_cs(warning_index);
print(" doesn't match its definition");
help4("If you say, e.g., `\\def\\a1{...}', then you must always")
    ("put `1' after `\\a', since control sequence names are")
    ("made up of letters only. The macro here has not been")
    ("followed by the required stuff, so I'm ignoring it.");
error();
goto end;

Section 399

⟨ Contribute an entire group to the current parameter 399 ⟩≡

unbalance = 1;
while(true) {
    fast_store_new_token(cur_tok);
    get_token();
    if (cur_tok == par_token && long_state != LONG_CALL) {
        // << Report a runaway argument and abort, 396 >>
    }
    if (cur_tok < RIGHT_BRACE_LIMIT) {
        if (cur_tok < LEFT_BRACE_LIMIT) {
            incr(unbalance);
        }
        else {
            decr(unbalance);
            if (unbalance == 0) {
                goto done1;
            }
        }
    }
}
done1:
rbrace_ptr = p;
store_new_token(cur_tok);

Section 400

If the parameter consists of a single group enclosed in braces, we must strip off the enclosing braces. That’s why rbrace_ptr was introduced.

⟨ Tidy up the parameter just scanned, and tuck it away 400 ⟩≡

if (m == 1 && info(p) < RIGHT_BRACE_LIMIT) {
    link(rbrace_ptr) = null;
    free_avail(p);
    p = link(TEMP_HEAD);
    pstack[n] = link(p);
    free_avail(p);
}
else {
    pstack[n] = link(TEMP_HEAD);
}
incr(n);
if (tracing_macros > 0) {
    begin_diagnostic();
    print_nl_strnumber(match_chr);
    print_int(n);
    print("<-");
    show_token_list(pstack[n - 1], null, 1000);
    end_diagnostic(false);
}

Section 401

⟨ Show the text of the macro being expanded 401 ⟩≡

begin_diagnostic();
print_ln();
print_cs(warning_index);
token_show(ref_count);
end_diagnostic(false);