// File: openqasm/source/grammar/qasm3Lexer.g4
// (ANTLR lexer grammar; listed as 264 lines, 8.8 KiB)

lexer grammar qasm3Lexer;
/* Naming conventions in this lexer grammar
 *
 * - Keywords and exact symbols that have only one possible value are written
 *   in all caps.  There is no more information in the matched text than in
 *   the name of the lexeme.  For example, `INCLUDE` is only ever the string
 *   `'include'`.
 *
 * - Lexemes with information in the string form are in PascalCase.  This
 *   indicates there is more information in the token than just the name.  For
 *   example, `Identifier` has a payload containing the name of the identifier.
 */
/* Language keywords.
 *
 * All of these rules are declared before `Identifier`, so when a keyword and
 * `Identifier` match exactly the same text the keyword rule is the one used.
 */
// `OPENQASM`, `include` and `defcalgrammar` each push a dedicated mode that
// lexes the one token following the keyword and then pops itself.
OPENQASM: 'OPENQASM' -> pushMode(VERSION_IDENTIFIER);
INCLUDE: 'include' -> pushMode(ARBITRARY_STRING);
DEFCALGRAMMAR: 'defcalgrammar' -> pushMode(ARBITRARY_STRING);
DEF: 'def';
// `cal` and `defcal` replace the current mode with `mode(...)` instead of
// pushing; the way back is the explicit `mode(DEFAULT_MODE)` in `CAL_BLOCK`.
CAL: 'cal' -> mode(CAL_PRELUDE);
DEFCAL: 'defcal' -> mode(DEFCAL_PRELUDE);
GATE: 'gate';
EXTERN: 'extern';
BOX: 'box';
LET: 'let';
// Control flow.
BREAK: 'break';
CONTINUE: 'continue';
IF: 'if';
ELSE: 'else';
END: 'end';
RETURN: 'return';
FOR: 'for';
WHILE: 'while';
IN: 'in';
SWITCH: 'switch';
CASE: 'case';
DEFAULT: 'default';
// Both `#pragma` and `pragma` are accepted.  The rest of the line is lexed in
// `EAT_TO_LINE_END`.
PRAGMA: '#'? 'pragma' -> pushMode(EAT_TO_LINE_END);
// `@` immediately followed by a (possibly dotted) identifier, e.g. `@a.b`.  The
// rest of the line is lexed in `EAT_TO_LINE_END`.
// NOTE(review): an `@` written directly against an identifier matches this rule
// rather than the bare `AT` symbol, because this match is longer - confirm that
// every use of `AT` is expected to be followed by whitespace.
AnnotationKeyword: '@' Identifier ('.' Identifier)* -> pushMode(EAT_TO_LINE_END);
/* Types.
 *
 * Fixed-text keywords for declaration qualifiers and type names.  Each token
 * matches exactly the quoted string and carries no further information.
 */
// Qualifiers.
INPUT: 'input';
OUTPUT: 'output';
CONST: 'const';
READONLY: 'readonly';
MUTABLE: 'mutable';
// Type names.
QREG: 'qreg';
QUBIT: 'qubit';
CREG: 'creg';
BOOL: 'bool';
BIT: 'bit';
INT: 'int';
UINT: 'uint';
FLOAT: 'float';
ANGLE: 'angle';
COMPLEX: 'complex';
ARRAY: 'array';
VOID: 'void';
DURATION: 'duration';
STRETCH: 'stretch';
/* Builtin identifiers and operations
 *
 * Like the keywords, these are declared before `Identifier` and therefore take
 * precedence over it when both match the same text.
 */
GPHASE: 'gphase';
INV: 'inv';
POW: 'pow';
CTRL: 'ctrl';
NEGCTRL: 'negctrl';
// Note that the leading `#` is part of this token's text.
DIM: '#dim';
DURATIONOF: 'durationof';
DELAY: 'delay';
RESET: 'reset';
MEASURE: 'measure';
BARRIER: 'barrier';
// PascalCase because the matched text (`true` or `false`) is the payload.
BooleanLiteral: 'true' | 'false';
/* Symbols
 *
 * Several tokens here share a prefix (`+` / `++` / `+=`, `>` / `>=` / `>>` /
 * `>>=`, ...).  The lexer prefers the longest match, so the relative order of
 * these rules does not decide which one is chosen.
 */
LBRACKET: '[';
RBRACKET: ']';
LBRACE: '{';
RBRACE: '}';
LPAREN: '(';
RPAREN: ')';
COLON: ':';
SEMICOLON: ';';
DOT: '.';
COMMA: ',';
EQUALS: '=';
ARROW: '->';
PLUS: '+';
DOUBLE_PLUS: '++';
MINUS: '-';
ASTERISK: '*';
DOUBLE_ASTERISK: '**';
SLASH: '/';
PERCENT: '%';
PIPE: '|';
DOUBLE_PIPE: '||';
AMPERSAND: '&';
DOUBLE_AMPERSAND: '&&';
CARET: '^';
AT: '@';
TILDE: '~';
EXCLAMATION_POINT: '!';
// Operator families: one token type each, with the matched text telling the
// consumer which operator was written.
EqualityOperator: '==' | '!=';
CompoundAssignmentOperator: '+=' | '-=' | '*=' | '/=' | '&=' | '|=' | '~=' | '^=' | '<<=' | '>>=' | '%=' | '**=';
ComparisonOperator: '>' | '<' | '>=' | '<=';
BitshiftOperator: '>>' | '<<';
/* Literals and identifiers. */
// Suffix of an imaginary literal.  Declared before `Identifier`, so a bare `im`
// is tokenised as `IMAG`.
IMAG: 'im';
// A decimal integer or float, optional spaces/tabs, then `im` - all one token.
ImaginaryLiteral: (DecimalIntegerLiteral | FloatLiteral) [ \t]* IMAG;
// Integer literals.  In every base a single `_` may follow any digit except the
// last one, so a literal always ends in a digit and never contains `__`.  The
// binary and hexadecimal prefixes accept either case; the octal prefix is the
// lower-case `0o` only.
BinaryIntegerLiteral: ('0b' | '0B') ([01] '_'?)* [01];
OctalIntegerLiteral: '0o' ([0-7] '_'?)* [0-7];
DecimalIntegerLiteral: ([0-9] '_'?)* [0-9];
HexIntegerLiteral: ('0x' | '0X') ([0-9a-fA-F] '_'?)* [0-9a-fA-F];
// Identifiers start with `_`, an ASCII letter, or a character in one of the
// Unicode categories listed in `ValidUnicode`; later characters may also be
// ASCII digits.
fragment ValidUnicode: [\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]; // valid unicode chars
fragment Letter: [A-Za-z];
fragment FirstIdCharacter: '_' | ValidUnicode | Letter;
fragment GeneralIdCharacter: FirstIdCharacter | [0-9];
Identifier: FirstIdCharacter GeneralIdCharacter*;
// `$` followed by one or more ASCII digits (no underscores allowed here).
HardwareQubit: '$' [0-9]+;
// `e` or `E`, an optional sign, then a decimal integer (underscores allowed).
fragment FloatLiteralExponent: [eE] (PLUS | MINUS)? DecimalIntegerLiteral;
FloatLiteral:
// 1_123e-3, 123e+4 or 123E5 (needs the exponent or it's just an integer)
DecimalIntegerLiteral FloatLiteralExponent
// .1234_5678 or .1e3 (no digits before the dot)
| DOT DecimalIntegerLiteral FloatLiteralExponent?
// 123.456, 123. or 145.32e+1_00
| DecimalIntegerLiteral DOT DecimalIntegerLiteral? FloatLiteralExponent?;
fragment TimeUnit: 'dt' | 'ns' | 'us' | 'µs' | 'ms' | 's';
// represents explicit time value in SI or backend units
// (a decimal integer or float, optional spaces/tabs, then the unit - one token)
TimingLiteral: (DecimalIntegerLiteral | FloatLiteral) [ \t]* TimeUnit;
// Double-quoted, non-empty run of `0`/`1` using the same single-underscore
// separator rule as the integer literals.
BitstringLiteral: '"' ([01] '_'?)* [01] '"';
// Ignore whitespace between tokens, and define C++-style comments.  All four
// rules discard their match with `skip`, so none of them produces a token.
Whitespace: [ \t]+ -> skip ;
Newline: [\r\n]+ -> skip ;
// `//` up to, but not including, the end of the line.
LineComment : '//' ~[\r\n]* -> skip;
// Non-greedy: the comment ends at the first `*/`, so block comments do not nest.
BlockComment : '/*' .*? '*/' -> skip;
// The version identifier token would be ambiguous between itself and
// integer/floating-point literals, so we use a special mode to ensure it's
// lexed correctly.  The mode is pushed by `OPENQASM` and popped as soon as the
// version has been read.
mode VERSION_IDENTIFIER;
// NOTE(review): the rule name is spelled `IDENTIFER` (missing `I`).  The name is
// part of the generated token vocabulary, so it is left unchanged here.
VERSION_IDENTIFER_WHITESPACE: [ \t\r\n]+ -> skip;
// `<digits>` or `<digits>.<digits>`; underscores are not accepted.
VersionSpecifier: [0-9]+ ('.' [0-9]+)? -> popMode;
// An include statement's path or defcalgrammar target is potentially ambiguous
// with `BitstringLiteral`, so it is lexed in its own mode.  The mode is pushed
// by `INCLUDE` / `DEFCALGRAMMAR` and popped once the string has been read.
mode ARBITRARY_STRING;
ARBITRARY_STRING_WHITESPACE: [ \t\r\n]+ -> skip;
// Allow ``"str"`` and ``'str'``.  The content must be non-empty and may not
// contain the delimiting quote character, a tab, a carriage return or a line
// feed; there is no escape syntax.
StringLiteral: ('"' ~["\r\t\n]+? '"' | '\'' ~['\r\t\n]+? '\'') -> popMode;
// A different lexer mode to swap to when we need to handle tokens on a line
// basis rather than the default arbitrary-whitespace-based tokenisation.  This
// is used by the annotation and pragma rules.
mode EAT_TO_LINE_END;
// Spaces and tabs before the line content are dropped.
EAT_INITIAL_SPACE: [ \t]+ -> skip;
// A single CR or LF leaves the mode; the character itself is discarded.
EAT_LINE_END: [\r\n] -> popMode, skip;
// The line content must be a non-empty token to satisfy ANTLR (otherwise it
// would be able to produce an infinite number of tokens).  We could include
// the line ending to guarantee that this is always non-empty, but that just
// puts an annoying burden on consumers to remove it again.
//
// The token starts at the first character that is not a space or tab and runs
// up to, but not including, the line ending, so trailing spaces are part of
// it.  This rule does not pop the mode; the following `EAT_LINE_END` does.
RemainingLineContent: ~[ \t\r\n] ~[\r\n]*;
// We need to do a little context-aware lexing when we hit a `cal` or `defcal`
// token.  In both cases, there's a small interlude before the pulse grammar
// block starts, and we need to be able to lex our way through that.  We don't
// want to tie this grammar to one host language by injecting host code to
// manage the state of the lexer, so instead we need to do a little duplication
// of the tokens, because ANTLR doesn't allow us to inherit rules directly.
//
// `CAL_PRELUDE` is entered by the `CAL` keyword.  Only whitespace, comments and
// the opening brace have rules in this mode.
mode CAL_PRELUDE;
CAL_PRELUDE_WHITESPACE: [ \t\r\n]+ -> skip;
CAL_PRELUDE_COMMENT: (LineComment | BlockComment) -> skip;
// Emitted as a plain `LBRACE`; the lexer then switches to `CAL_BLOCK`.
CAL_PRELUDE_LBRACE: LBRACE -> type(LBRACE), mode(CAL_BLOCK);
// Entered by the `DEFCAL` keyword.  Lexes everything between `defcal` and the
// opening brace of the calibration body.  Each rule below reuses the pattern of
// a default-mode rule and re-emits it under that rule's token type via
// `type(...)`, so the parser sees the ordinary token types.
mode DEFCAL_PRELUDE;
DEFCAL_PRELUDE_WHITESPACE: [ \t\r\n]+ -> skip;
DEFCAL_PRELUDE_COMMENT: (LineComment | BlockComment) -> skip;
// Emitted as a plain `LBRACE`; the lexer then switches to `CAL_BLOCK`.
DEFCAL_PRELUDE_LBRACE: LBRACE -> type(LBRACE), mode(CAL_BLOCK);
// Duplications of valid constant expression tokens that may appear in the
// argument list. This is an unfortunately large number of duplications.
//
// NOTE(review): only the tokens listed below exist in this mode.  Text such as
// `stretch`, `const`, `readonly` or `mutable` therefore lexes as `Identifier`
// here, a timing or imaginary literal is split into a number followed by an
// `Identifier`, and operators without a duplicate (for example `%` or `**`)
// have no matching rule - confirm that this set is the intended one.
// Types.
DEFCAL_PRELUDE_QREG: QREG -> type(QREG);
DEFCAL_PRELUDE_QUBIT: QUBIT -> type(QUBIT);
DEFCAL_PRELUDE_CREG: CREG -> type(CREG);
DEFCAL_PRELUDE_BOOL: BOOL -> type(BOOL);
DEFCAL_PRELUDE_BIT: BIT -> type(BIT);
DEFCAL_PRELUDE_INT: INT -> type(INT);
DEFCAL_PRELUDE_UINT: UINT -> type(UINT);
DEFCAL_PRELUDE_ANGLE: ANGLE -> type(ANGLE);
DEFCAL_PRELUDE_FLOAT: FLOAT -> type(FLOAT);
DEFCAL_PRELUDE_COMPLEX: COMPLEX -> type(COMPLEX);
DEFCAL_PRELUDE_ARRAY: ARRAY -> type(ARRAY);
DEFCAL_PRELUDE_DURATION: DURATION -> type(DURATION);
// Punctuation.
DEFCAL_PRELUDE_LBRACKET: LBRACKET -> type(LBRACKET);
DEFCAL_PRELUDE_RBRACKET: RBRACKET -> type(RBRACKET);
DEFCAL_PRELUDE_LPAREN: LPAREN -> type(LPAREN);
DEFCAL_PRELUDE_RPAREN: RPAREN -> type(RPAREN);
DEFCAL_PRELUDE_ARROW: ARROW -> type(ARROW);
DEFCAL_PRELUDE_COMMA: COMMA -> type(COMMA);
DEFCAL_PRELUDE_PLUS: PLUS -> type(PLUS);
DEFCAL_PRELUDE_MINUS: MINUS -> type(MINUS);
DEFCAL_PRELUDE_ASTERISK: ASTERISK -> type(ASTERISK);
DEFCAL_PRELUDE_SLASH: SLASH -> type(SLASH);
DEFCAL_PRELUDE_BitshiftOperator: BitshiftOperator -> type(BitshiftOperator);
// Literals and names.
DEFCAL_PRELUDE_BitstringLiteral: BitstringLiteral -> type(BitstringLiteral);
DEFCAL_PRELUDE_BinaryIntegerLiteral: BinaryIntegerLiteral -> type(BinaryIntegerLiteral);
DEFCAL_PRELUDE_OctalIntegerLiteral: OctalIntegerLiteral -> type(OctalIntegerLiteral);
DEFCAL_PRELUDE_DecimalIntegerLiteral: DecimalIntegerLiteral -> type(DecimalIntegerLiteral);
DEFCAL_PRELUDE_HexIntegerLiteral: HexIntegerLiteral -> type(HexIntegerLiteral);
DEFCAL_PRELUDE_FloatLiteral: FloatLiteral -> type(FloatLiteral);
// These keyword duplicates (and the type keywords above) must stay ahead of
// `DEFCAL_PRELUDE_Identifier`, which matches the same text.
DEFCAL_PRELUDE_MEASURE: MEASURE -> type(MEASURE);
DEFCAL_PRELUDE_DELAY: DELAY -> type(DELAY);
DEFCAL_PRELUDE_RESET: RESET -> type(RESET);
DEFCAL_PRELUDE_Identifier: Identifier -> type(Identifier);
DEFCAL_PRELUDE_HardwareQubit: HardwareQubit -> type(HardwareQubit);
// The meat-and-potatoes of matching a calibration block with balanced inner
// braces.  We enter `CAL_BLOCK` with the opening brace already tokenised
// (that's how the lexer knew to swap modes to us), and with the token left open
// to continue to accumulate.  We want to tokenise until we hit the balancing
// brace.  Since we have _no_ knowledge of what the inner language is doing,
// things like unbalanced braces in comments will cause a failure, but there's
// not much we can do about that without greater spec restrictions.
mode CAL_BLOCK;
// Recursive helper: a `{ ... }` group whose contents are either further
// balanced groups or any characters other than braces.
fragment NestedCalibrationBlock: LBRACE (NestedCalibrationBlock | ~[{}])* RBRACE;
// The body as one token: balanced groups and non-brace characters.  At least
// one character is required, so an empty body `{}` yields no `CalibrationBlock`.
CalibrationBlock: (NestedCalibrationBlock | ~[{}])+;
// The closing brace of the body is emitted as a plain `RBRACE` and returns the
// lexer to the default mode.
CAL_BLOCK_RBRACE: RBRACE -> type(RBRACE), mode(DEFAULT_MODE);