glib scanner

Re: [gtk-list] Glib Lexical Scanner
From: Tim Janik <timj gtk org>
To: Gtk List <gtk-list redhat com>
cc: Emmanuel DELOGET <logout free fr>, Trog <trog gtk org>
Subject: Re: [gtk-list] Glib Lexical Scanner
Date: Fri, 27 Aug 1999 02:58:41 +0200 (CEST)
On Thu, 26 Aug 1999, Geert Bevin wrote:

> Hello,
>
> I'm looking for documentation or simple examples about how to use the
> Glib Lexical Scanner. I've been using the Gnome Config functions up to
> now, but I'd like to make the filemanager I'm writing glib/gtk+ only and
> using the expat xml library for simple rc files seems a bit of an
> overkill.
>
> Anyone has some material about this,

hi geert,

here are three emails from my archive that'll hopefully shed some light
on the gscanner issue for you. feel free to post more questions about it.

[Emmanuel and Trog, can we have something like this in the FAQ
or the tutorial please? i'd like to avoid rehashing of this over
and over again.]

>
> Thanks a lot,
>
>
> Geert.
>

---
ciaoTJ

Date: Fri, 5 Mar 1999 19:25:24 +0100 (CET)
From: Tim Janik <[email protected]>
To: GTK User-List <[email protected]>
Subject: [gtk-list] Re: g_scanner funtions what for?

On Fri, 5 Mar 1999, Andreas Tille wrote:

> Hello,
>
> I have to read some certain values from a textfile. The values
> are stored as:
>
> <key>: <value>
>
> I wonder if the (undocumented :-() g_scanner functions are my friend
> or if I have to write my own gets/scanf stuff (ore use older things
> from former projects). Can anybody give me a hint if it is possible
> to to it in "The GTK+ Way" to have a clean GTK+ API or if these
> functions are intended to solve other things. I tried to find
> a sense from gtkrc.c but failed understanding it.

a GScanner will tokenize your text, that is, it'll return an integer
for every word or number that appears in its input stream, following
certain (customizable) rules to perform this translation.
you still need to write the parsing functions on your own though.
here's a little test program that will parse

<SYMBOL> = <OPTIONAL-MINUS> <NUMBER> ;

constructs, while skipping "#\n" and "/**/" style comments.

#include <string.h>
#include <glib.h>

/* sample input handed to the scanner: a handful of "name = value;"
 * statements mixed with a multi-line C-style comment, an empty line
 * and '#' line comments
 */
static const gchar *test_text =
  "ping = 5;\n"
  "/* slide in some \n"
  " * comments, just for the\n"
  " * fun of it \n"
  " */\n"
  "pong = -6; \n"
  "\n"
  "# the next value is a float\n"
  "zonk = 0.7;\n"
  "# redefine ping\n"
  "ping = - 0.5;\n";

/* token values the scanner reports for our own symbols; numbering
 * starts right after G_TOKEN_LAST and counts upwards from there
 */
enum {
  SYMBOL_PING = G_TOKEN_LAST + 1,
  SYMBOL_PONG,
  SYMBOL_ZONK
};

/* symbol table: pairs each symbol's spelling with the token value that
 * is registered for it; an entry with a NULL name terminates the table.
 * the names are string literals, hence the const-qualified pointer.
 * symbol_p is the cursor used to walk the table, it starts at the
 * first entry.
 */
static const struct {
  const gchar *symbol_name;
  guint        symbol_token;
} symbols[] = {
  { "ping", SYMBOL_PING },
  { "pong", SYMBOL_PONG },
  { "zonk", SYMBOL_ZONK },
  { NULL, 0 },
}, *symbol_p = symbols;

/* parse results: the value most recently assigned to each symbol,
 * zero as long as no assignment has been parsed
 */
static gfloat ping = 0.0f;
static gfloat pong = 0.0f;
static gfloat zonk = 0.0f;

/* parse one statement of the form
 *
 *   <SYMBOL> = <OPTIONAL-MINUS> <NUMBER> ;
 *
 * and store the (possibly negated) number in ping, pong or zonk,
 * depending on the symbol.
 *
 * returns G_TOKEN_NONE on success, otherwise the token that was
 * expected at the point where parsing gave up.
 */
static guint
parse_symbol (GScanner *scanner)
{
  guint    which;
  gboolean minus_seen = FALSE;
  gdouble  number;

  /* a statement has to start with one of our symbols */
  which = g_scanner_get_next_token (scanner);
  if (!(which >= SYMBOL_PING && which <= SYMBOL_ZONK))
    return G_TOKEN_SYMBOL;

  /* followed by '=' */
  if (g_scanner_get_next_token (scanner) != '=')
    return '=';

  /* a '-' may precede the number, consume it if present */
  if (g_scanner_peek_next_token (scanner) == '-')
    {
      g_scanner_get_next_token (scanner);
      minus_seen = TRUE;
    }

  /* the number itself (ints are converted to floats on the fly) */
  if (g_scanner_get_next_token (scanner) != G_TOKEN_FLOAT)
    return G_TOKEN_FLOAT;
  number = scanner->value.v_float;
  if (minus_seen)
    number = - number;

  /* the statement has to be closed by ';'. if something else follows,
   * consume it before erroring out, so it becomes the current token
   */
  if (g_scanner_peek_next_token (scanner) != ';')
    {
      g_scanner_get_next_token (scanner);
      return ';';
    }

  /* everything matched: store the value for the symbol we saw */
  if (which == SYMBOL_PING)
    ping = number;
  else if (which == SYMBOL_PONG)
    pong = number;
  else if (which == SYMBOL_ZONK)
    zonk = number;

  /* eat the semicolon we peeked at */
  g_scanner_get_next_token (scanner);

  return G_TOKEN_NONE;
}

int
main (int argc, char *argv[])
{
GScanner *scanner;
guint expected_token;

scanner = g_scanner_new (NULL);

/* adjust lexing behaviour to suit our needs
*/
/* convert non-floats (octal values, hex values...) to G_TOKEN_INT */
scanner->config->numbers_2_int = TRUE;
/* convert G_TOKEN_INT to G_TOKEN_FLOAT */
scanner->config->int_2_float = TRUE;
/* don't return G_TOKEN_SYMBOL, but the symbol's value */
scanner->config->symbol_2_token = TRUE;

/* load symbols into the scanner */
while (symbol_p->symbol_name)
{
g_scanner_add_symbol (scanner,
symbol_p->symbol_name,
GINT_TO_POINTER (symbol_p->symbol_token));
symbol_p++;
}

/* feed in the text */
g_scanner_input_text (scanner, test_text, strlen (test_text));

/* give the error handler an idea on how the input is named */
scanner->input_name = "test text";

/* scanning loop, we parse the input untill it's end is reached,
* the scanner encountered a lexing error, or our sub routine came
* across invalid syntax
*/
do
{
expected_token = parse_symbol (scanner);

g_scanner_peek_next_token (scanner);
}
while (expected_token == G_TOKEN_NONE &&
scanner->next_token != G_TOKEN_EOF &&
scanner->next_token != G_TOKEN_ERROR);

/* give an error message upon syntax errors */
if (expected_token != G_TOKEN_NONE)
g_scanner_unexp_token (scanner, expected_token, NULL, "symbol", NULL, NULL, TRUE);

/* finsish parsing */
g_scanner_destroy (scanner);

/* print results */
g_print ("ping: %f\n", ping);
g_print ("pong: %f\n", pong);
g_print ("zonk: %f\n", zonk);

return 0;
}

this gives:
$ gcc -Wall `glib-config --cflags --libs` gscanner-ex.c && ./a.out
ping: -0.500000
pong: -6.000000
zonk: 0.700000
$

if you change line 6 in the input text to "pong = +6; \n", you get:

$ gcc -Wall `glib-config --cflags --libs` gscanner-ex.c && ./a.out
test text:6: error: unexpected character `+', expected number (float)
ping: 5.000000
pong: 0.000000
zonk: 0.000000
$

because parse_symbol() does not handle a leading '+'. since parsing is aborted
at that point, the lines that would assign values to pong and zonk are not
evaluated and thus their values remain 0.

>
> Kind regards
>
> Andreas.
>

---
ciaoTJ

--
To unsubscribe: mail -s unsubscribe [email protected] < /dev/null

Date: Fri, 12 Mar 1999 14:51:44 +0100 (CET)
From: Tim Janik <[email protected]>
To: GTK User-List <[email protected]>
Subject: [gtk-list] Re: g_scanner funtions what for?

On Tue, 9 Mar 1999, Andreas Tille wrote:

> On Fri, 5 Mar 1999, Tim Janik wrote:
>
> > a GScanner will tokenize your text, that is, it'll return an integer
> > for every word or number that appears in its input stream, following
> > certain (customizable) rules to perform this translation.
> > you still need to write the parsing functions on your own though.
> > here's a little test program that will parse
> >
> > <SYMBOL> = <OPTIONAL-MINUS> <NUMBER> ;
> >
> > constructs, while skipping "#\n" and "/**/" style comments.
> > ...
> Thanks for the example. I treid it and it worked so far. Because
> I had to scan also other things than floats I modified your
> example but wasn't successful. I append my code as attachment.
>
> The code fails in scanning gchar * data. Is there any documentation
> to do that right?

your modifications:

/* some test text to be fed into the scanner */
static const gchar *test_text =
( "Datum: 02-16-1999\n"
"Probenname: Hans_1\n"
"Modulator f: 500.0\n"
"Schwingkreis f: 500.0\n"
"Schwingkreis U: 5.0\n"
"Temperatur: 25.0\n"
"Trigger-Delay: -400\n"
"Punktzahl: 8192\n"
"Messzeit: 2.0e-006\n" );

"Schwingkreis f" can't be scanned as a single token, because a GScanner
will skip spaces by default and not feature spaces as a valid symbol char.

when you create a new scanner, g_scanner_new() accepts a pointer to a
GScannerConfig structure that contains various default values for its
scanning behaviour. if you pass that structure as NULL, gscanner.c will
revert to a default structure, defined in gscanner.c:

static GScannerConfig g_scanner_config_template =
{
(
" \t\r\n"
) /* cset_skip_characters */,
(
G_CSET_a_2_z
"_"
G_CSET_A_2_Z
) /* cset_identifier_first */,
(
G_CSET_a_2_z
"_0123456789"
G_CSET_A_2_Z
G_CSET_LATINS
G_CSET_LATINC
) /* cset_identifier_nth */,
[....]

the field cset_skip_characters is setup with " \t\r\n", so that
spaces, tabs newlines and carriage returns will be automatically
skipped by the scanner.

further, "Trigger-Delay" can't be a valid symbol either with the
default configuration. symbols are initially scanned as identifiers, and
eventually get converted to symbols if a lookup in the scanner's internal hash
table is successful. there is no '-' contained in either the
cset_identifier_first or the cset_identifier_nth field, so GScanner will not
parse identifiers across '-'es.

to get this to work, you should either rename "Trigger-Delay" and "Schwingkreis f"
to "Trigger_Delay" and "Schwingkreis_f" or, if you want to tweak the default
configuration, "Trigger-Delay" and "Schwingkreis-f", but then you need to
add a '-' to cset_identifier_nth, and you can't parse constructs like

x = 5; y = x-2;

because the '-' would not be returned as a separate character.

also, you have to adapt parse_symbol() so it parses tokens other than float as
well, instead of:

/* expect a valid symbol */
g_scanner_get_next_token (scanner);
symbol = scanner->token;
if (symbol < SYMBOL_DATE ||
symbol > SYMBOL_T)
return G_TOKEN_SYMBOL;

/* expect '=' */
g_scanner_get_next_token (scanner);
if (scanner->token != ':')
return '=';

/* expect a float (ints are converted to floats on the fly) */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_FLOAT)
return G_TOKEN_FLOAT;

/* assign value, eat the semicolon and exit successfully */
switch (symbol)
{
case SYMBOL_DATE:
date = scanner->value.v_string;
break;
case SYMBOL_F_m:
F_m = scanner->value.v_float;
break;

which will only parse floats and then switch() on the symbols,
you need to do something like:

/* expect a valid symbol */
g_scanner_get_next_token (scanner);
symbol = scanner->token;
if (symbol < SYMBOL_DATE ||
symbol > SYMBOL_T)
return G_TOKEN_SYMBOL;

/* expect '=' */
g_scanner_get_next_token (scanner);
if (scanner->token != ':')
return '=';

/* assign value, eat the semicolon and exit successfully */
switch (symbol)
{
case SYMBOL_DATE:
/* expect a string */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_STRING)
return G_TOKEN_STRING;
date = g_strdup (scanner->value.v_string);
break;
case SYMBOL_F_m:
/* expect a float (ints are converted to floats on the fly) */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_FLOAT)
return G_TOKEN_FLOAT;
F_m = scanner->value.v_float;
break;

to assure you get the correct tokens, matching the symbols. also, if you
retrieve strings from the scanner, you have to copy them, as a gscanner
will of course free its values on the fly again (i.e. when the next value
is put into scanner->value).

>
> Kind regards
>
> Andreas.
>

---
ciaoTJ

Date: Wed, 17 Mar 1999 11:16:07 +0100 (CET)
From: Tim Janik <[email protected]>
To: Andreas Tille <[email protected]>
Cc: GTK User-List <[email protected]>
Subject: [gtk-list] Re: g_scanner funtions what for?

On Tue, 16 Mar 1999, Andreas Tille wrote:

> On Fri, 12 Mar 1999, Tim Janik wrote:
>
> > your modifications:
> >
> > /* some test text to be fed into the scanner */
> > static const gchar *test_text =
> > ( "Datum: 02-16-1999\n"
> > "Probenname: Hans_1\n"
> > "Modulator f: 500.0\n"
> Changed to: Modulator_f
> > "Schwingkreis f: 500.0\n"
> Changed to: Schwingkreis_f
> > "Schwingkreis U: 5.0\n"
> Changed to: Schwingkreis_U
> > "Temperatur: 25.0\n"
> > "Trigger-Delay: -400\n"
> Changed to: Trigger_Delay
> > "Punktzahl: 8192\n"
> > "Messzeit: 2.0e-006\n" );
> >
> > "Schwingkreis f" can't be scanned as a single token, because a GScanner
> > will skip spaces by default and not feature spaces as a valid symbol char.
> >
> > when you create a new scanner, g_scanner_new() accepts a pointer to a
> > GScannerConfig structure that contains various default values for its
> > scanning behaviour. if you pass that structure as NULL, gscanner.c will
> > revert to a default structure, defined in gscanner.c:
> I hope to fit the requirements for the tokens so far. Note that I
> didn't ended the lines with ';' (is this required?).

only if you require such tokens with e.g.

g_scanner_get_next_token (scanner);
if (scanner->token != ';')
return ';';

> > also, you have to adapt parse_symbol() so it parses tokens other than float as
> > well, instead of:
> >
> > /* expect a valid symbol */
> > g_scanner_get_next_token (scanner);
> > symbol = scanner->token;
> > if (symbol < SYMBOL_DATE ||
> > symbol > SYMBOL_T)
> > return G_TOKEN_SYMBOL;
> >
> > ...
> Hmm, I've done that but my program dosn't reach this point. It returns
> with error code for before that stuff.
>
> I tried
>
> /* expect a float (ints are converted to floats on the fly) *or* a string */
> g_scanner_get_next_token (scanner);
> if (scanner->token != G_TOKEN_FLOAT || scanner->token != G_TOKEN_STRING )
> return G_TOKEN_FLOAT;

andreas, look at this more closely, let's s/scanner->token/foo/,
s/G_TOKEN_FLOAT/1/ and s/G_TOKEN_STRING/2/:

if (foo != 1 || foo != 2)
...

literally:
IF foo is not equal to 1 OR foo is not equal to 2

so to get beyond this condition, you require scanner->token to be
==G_TOKEN_FLOAT *and* ==G_TOKEN_STRING at the same time, obviously impossible.

so if at all, you want
if (scanner->token != G_TOKEN_FLOAT && scanner->token != G_TOKEN_STRING)

but i doubt that this is your problem, see below.

> read floats *AND* strings but failed and I get
>
> test text:1: error: unexpected string constant "J02161999", expected number (float)
>
> If I set "Datum: J02161999\n" (the '-' int the former string caused trouble)
> or
>
> test text:1: error: scanner: digit is beyond radix
>
> if I leave "Datum: 02161999\n" ... a string of spaces only. I expect
> that I have to configure the scanner to read strings (like a date or
> a name), but I can't find any way how to do that.

you need to understand that the scanner will parse its input and
tokenize it. it is up to you to interpret these tokens, not to define
their types before they get parsed, e.g. watch gscanner parse a string:

"hi i am 17"
| | | |
| | | v
| | v TOKEN_INT, value: 17
| v TOKEN_IDENTIFIER, value: "am"
v TOKEN_CHAR, value: 'i'
TOKEN_IDENTIFIER, value: "hi"

if you configure the scanner with
scanner->config->int_2_float = TRUE;
scanner->config->char_2_token = TRUE;
scanner->config->scan_symbols = TRUE;

and add "am" as a symbol with
g_scanner_add_symbol (scanner, "am", "symbol value");

gscanner will parse it as

"hi i am 17"
| | | |
| | | v
| | v TOKEN_FLOAT, value: 17.0 (automatic int->float conversion)
| | TOKEN_SYMBOL, value: "symbol value" (a successfull hash table lookup
| | turned a TOKEN_IDENTIFIER into a
| | TOKEN_SYMBOL and took over the
| v symbol's value)
v 'i' ('i' can be a valid token as well, as all chars >0 and <256)
TOKEN_IDENTIFIER, value: "hi"

you need to match the token sequence with your code, and if you encounter
something that you don't want, you error out:

/* expect an identifier ("hi") */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_IDENTIFIER)
return G_TOKEN_IDENTIFIER;
/* expect a token 'i' */
g_scanner_get_next_token (scanner);
if (scanner->token != 'i')
return 'i';
/* expect a symbol ("am") */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_SYMBOL)
return G_TOKEN_SYMBOL;
/* expect a float (17.0) */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_FLOAT)
return G_TOKEN_FLOAT;

if you got past here, you have parsed "hi i am 17" and would have
accepted "dooh i am 42" and "bah i am 0.75" as well, but you would
not have accepted "hi 7 am 17" or "hi i hi 17".

>
> Kind regards

hope, this helps.

>
> Andreas.
>

---
ciaoTJ

猜你喜欢