pokemon-showdown-client/lib/css-sanitizer/Wikimedia/CSS/Parser/Parser.php
2023-06-16 11:33:53 -07:00

711 lines
20 KiB
PHP

<?php
/**
* @file
* @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
*/
namespace Wikimedia\CSS\Parser;
use Wikimedia\CSS\Objects\AtRule;
use Wikimedia\CSS\Objects\ComponentValue;
use Wikimedia\CSS\Objects\ComponentValueList;
use Wikimedia\CSS\Objects\CSSFunction;
use Wikimedia\CSS\Objects\Declaration;
use Wikimedia\CSS\Objects\DeclarationList;
use Wikimedia\CSS\Objects\DeclarationOrAtRuleList;
use Wikimedia\CSS\Objects\QualifiedRule;
use Wikimedia\CSS\Objects\Rule;
use Wikimedia\CSS\Objects\RuleList;
use Wikimedia\CSS\Objects\SimpleBlock;
use Wikimedia\CSS\Objects\Stylesheet;
use Wikimedia\CSS\Objects\Token;
// Note: While reading the code below, you might find that my calls to
// consumeToken() don't match what the spec says, and that I never "reconsume"
// a token. The spec is overcomplicated and confused with respect to the
// "current input token" and the "next input token". In practice things are
// pretty simple: every "consume an X" is called with the current input token
// being the first token of X, and returns with the current input token being
// the last token of X (or EOF if X ends at EOF).
// Also of note: since our Tokenizer can only return a stream of tokens
// rather than a stream of component values, the consume functions here only
// consider tokens. ComponentValueList::toTokenArray() may be used to convert a
// list of component values to a list of tokens if necessary.
/**
* Parse CSS into a structure for further processing.
*
* This implements the CSS Syntax Module Level 3 candidate recommendation.
* @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/
*
* The usual entry points are:
* - Parser::parseStylesheet() to parse a stylesheet or the contents of a <style> tag.
* - Parser::parseDeclarationList() to parse an inline style attribute
*/
class Parser {
/**
* Maximum depth of nested ComponentValues
*
* Arbitrary number that seems like it should be enough. Once
* self::consumeComponentValue() nests deeper than this, it records a
* 'recursion-depth-exceeded' parse error and discards the rest of the input.
*/
private const CV_DEPTH_LIMIT = 100;
/** @var Tokenizer Source of the tokens being parsed */
protected $tokenizer;
/** @var Token|null The most recently consumed token, or null before the first self::consumeToken() call */
protected $currentToken = null;
/**
* @var array Parse errors. Each error is [ string $tag, int $line, int $pos, ... ],
* where any further elements are extra data supplied when the error was recorded.
*/
protected $parseErrors = [];
/** @var int Recursion depth, incremented in self::consumeComponentValue() */
protected $cvDepth = 0;
/**
* Create a Parser that reads its tokens from the given Tokenizer.
*
* No token is consumed here; the public parse*() entry points load the
* first token themselves.
* @param Tokenizer $tokenizer CSS Tokenizer
*/
public function __construct( Tokenizer $tokenizer ) {
$this->tokenizer = $tokenizer;
}
/**
 * Build a Parser that reads CSS from a string
 * @param string $source CSS to parse.
 * @param array $options Configuration options, see DataSourceTokenizer::__construct(). Also,
 *  - convert: (array) If specified, detect the encoding as defined in the
 *    CSS spec. The value is passed as the $encodings argument to
 *    Encoder::convert().
 * @return static
 */
public static function newFromString( $source, array $options = [] ) {
	// A missing or null 'convert' option means the source is used as given.
	$encodings = $options['convert'] ?? null;
	if ( $encodings !== null ) {
		$source = Encoder::convert( $source, $encodings );
	}
	$dataSource = new StringDataSource( $source );
	return static::newFromDataSource( $dataSource, $options );
}
/**
 * Build a Parser that reads CSS from a DataSource
 *
 * The source is wrapped in a DataSourceTokenizer, which is then handed to
 * the constructor.
 * @param DataSource $source CSS to parse.
 * @param array $options Configuration options, see DataSourceTokenizer::__construct().
 * @return static
 */
public static function newFromDataSource( DataSource $source, array $options = [] ) {
	return new static( new DataSourceTokenizer( $source, $options ) );
}
/**
 * Create a Parser for a list of Tokens
 *
 * The tokens are wrapped in a TokenListTokenizer, which is then handed to
 * the constructor.
 * @param Token[] $tokens Token-stream to parse
 * @param Token|null $eof EOF-token, passed through to TokenListTokenizer
 * @return static
 */
public static function newFromTokens( array $tokens, ?Token $eof = null ) {
	// The nullable type is spelled out explicitly: relying on "= null" to make
	// the parameter nullable is deprecated as of PHP 8.4.
	$tokenizer = new TokenListTokenizer( $tokens, $eof );
	return new static( $tokenizer );
}
/**
 * Advance $this->currentToken to the next token from the tokenizer
 *
 * Once the current token is EOF this is a no-op, so callers may keep
 * calling it without reading past the end of the input. Any parse errors
 * the tokenizer reported are moved into this parser's own error list.
 */
protected function consumeToken() {
	if ( $this->currentToken && $this->currentToken->type() === Token::T_EOF ) {
		// Already at the end of input; stay on the EOF token.
		return;
	}

	$this->currentToken = $this->tokenizer->consumeToken();

	// Take over whatever errors the tokenizer produced, then reset its list
	// so the same errors are not copied again on the next call.
	$tokenizerErrors = $this->tokenizer->getParseErrors();
	foreach ( $tokenizerErrors as $tokenizerError ) {
		$this->parseErrors[] = $tokenizerError;
	}
	$this->tokenizer->clearParseErrors();
}
/**
 * Consume a token, then keep consuming for as long as the current token is
 * whitespace
 *
 * On return the current token is the first non-whitespace token at or after
 * the one that was consumed (EOF if the input runs out first).
 */
protected function consumeTokenAndWhitespace() {
	$this->consumeToken();
	while ( $this->currentToken->type() === Token::T_WHITESPACE ) {
		$this->consumeToken();
	}
}
/**
* Return all parse errors seen so far
*
* This includes errors recorded by this parser via self::parseError() and
* errors copied over from the tokenizer in self::consumeToken().
* @return array Array of [ string $tag, int $line, int $pos, ... ]
*/
public function getParseErrors() {
return $this->parseErrors;
}
/**
* Clear parse errors
*
* Resets this parser's own error list to empty.
*/
public function clearParseErrors() {
$this->parseErrors = [];
}
/**
 * Append a parse error to the error list
 *
 * The stored entry is [ $tag, line, position ] followed by the elements
 * of $data.
 * @param string $tag Error tag
 * @param Token $token Report the error as starting at this token.
 * @param array $data Extra data about the error.
 */
protected function parseError( $tag, Token $token, array $data = [] ) {
	[ $errorLine, $errorPos ] = $token->getPosition();
	$entry = [ $tag, $errorLine, $errorPos ];
	$this->parseErrors[] = array_merge( $entry, $data );
}
/**
 * Parse the input as a complete stylesheet
 *
 * Rules are consumed in top-level mode, so CDO and CDC tokens between
 * rules are skipped.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#parse-stylesheet
 * @return Stylesheet
 */
public function parseStylesheet() {
	// Load the first token so the consume function has something to look at
	$this->consumeToken();
	return new Stylesheet( $this->consumeRuleList( true ) );
}
/**
 * Parse the input as a list of rules
 *
 * Unlike self::parseStylesheet(), rules are consumed in non-top-level
 * mode, so a CDO or CDC token starts a qualified rule.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#parse-list-of-rules
 * @return RuleList
 */
public function parseRuleList() {
	// Load the first token so the consume function has something to look at
	$this->consumeToken();
	$rules = $this->consumeRuleList( false );
	return $rules;
}
/**
 * Parse the input as exactly one rule
 *
 * Leading and trailing whitespace is allowed. Empty input, an unterminated
 * qualified rule, or anything other than whitespace after the rule is a
 * syntax error: null is returned and a parse error is recorded.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#parse-rule
 * @return Rule|null
 */
public function parseRule() {
	// Step 1: move to the first non-whitespace token.
	$this->consumeTokenAndWhitespace();

	// Step 2: nothing to parse, or pick the rule type from the first token.
	$firstType = $this->currentToken->type();
	if ( $firstType === Token::T_EOF ) {
		// "return a syntax error"?
		$this->parseError( 'unexpected-eof', $this->currentToken );
		return null;
	}
	$rule = $firstType === Token::T_AT_KEYWORD
		? $this->consumeAtRule()
		: $this->consumeQualifiedRule();
	if ( !$rule ) {
		// Only a qualified rule can come back empty; it has already
		// recorded its own error where applicable.
		return null;
	}

	// Step 3: skip whitespace after the rule.
	$this->consumeTokenAndWhitespace();

	// Step 4: only EOF may follow.
	if ( $this->currentToken->type() !== Token::T_EOF ) {
		// "return a syntax error"?
		$this->parseError( 'expected-eof', $this->currentToken );
		return null;
	}
	return $rule;
}
/**
 * Parse the input as a single declaration
 *
 * The first non-whitespace token must be an identifier; otherwise an
 * 'expected-ident' parse error is recorded and null is returned.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#parse-declaration
 * @return Declaration|null
 */
public function parseDeclaration() {
	// Step 1: move to the first non-whitespace token.
	$this->consumeTokenAndWhitespace();

	// Steps 2 and 3: an identifier starts the declaration. Declarations
	// always run to EOF, so there is no trailing content to check for.
	if ( $this->currentToken->type() === Token::T_IDENT ) {
		return $this->consumeDeclaration();
	}

	// "return a syntax error"?
	$this->parseError( 'expected-ident', $this->currentToken );
	return null;
}
/**
 * Parse the input as a list of declarations only
 *
 * At-rules are not allowed here: each one found is reported as a parse
 * error and left out of the result.
 * @note This is not the entry point the standard calls "parse a list of declarations",
 *  see self::parseDeclarationOrAtRuleList()
 * @return DeclarationList
 */
public function parseDeclarationList() {
	// Load the first token so the consume function has something to look at
	$this->consumeToken();
	$declarations = $this->consumeDeclarationOrAtRuleList( false );
	return $declarations;
}
/**
 * Parse the input as a list that may mix declarations and at-rules
 * @note This is the entry point the standard calls "parse a list of declarations"
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#parse-list-of-declarations
 * @return DeclarationOrAtRuleList
 */
public function parseDeclarationOrAtRuleList() {
	// Load the first token so the consume function has something to look at
	$this->consumeToken();
	$items = $this->consumeDeclarationOrAtRuleList();
	return $items;
}
/**
 * Parse the input as exactly one (non-whitespace) component value
 *
 * Leading and trailing whitespace is allowed. Empty input, or anything
 * other than whitespace after the value, is a syntax error: null is
 * returned and a parse error is recorded.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#parse-component-value
 * @return ComponentValue|null
 */
public function parseComponentValue() {
	// Step 1: move to the first non-whitespace token.
	$this->consumeTokenAndWhitespace();

	// Step 2: there has to be something to parse.
	if ( $this->currentToken->type() === Token::T_EOF ) {
		// "return a syntax error"?
		$this->parseError( 'unexpected-eof', $this->currentToken );
		return null;
	}

	// Step 3
	$componentValue = $this->consumeComponentValue();

	// Step 4: skip whitespace after the value.
	$this->consumeTokenAndWhitespace();

	// Step 5: only EOF may follow.
	if ( $this->currentToken->type() !== Token::T_EOF ) {
		// "return a syntax error"?
		$this->parseError( 'expected-eof', $this->currentToken );
		return null;
	}
	return $componentValue;
}
/**
 * Parse the whole input as a list of component values
 *
 * Component values are collected until one of them is the EOF token, which
 * is not added to the list.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#parse-list-of-component-values
 * @return ComponentValueList
 */
public function parseComponentValueList() {
	$result = new ComponentValueList();

	// Move to the first token and read the first component value
	$this->consumeToken();
	$cv = $this->consumeComponentValue();
	while ( !( $cv instanceof Token && $cv->type() === Token::T_EOF ) ) {
		$result->add( $cv );
		// Move to the next token and read the next component value
		$this->consumeToken();
		$cv = $this->consumeComponentValue();
	}

	return $result;
}
/**
 * Parse the whole input as comma-separated lists of component values
 *
 * Only comma tokens that are returned as a component value themselves act
 * as separators. The result always holds at least one (possibly empty)
 * list, and neither the commas nor the EOF token are included.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#parse-comma-separated-list-of-component-values
 * @return ComponentValueList[]
 */
public function parseCommaSeparatedComponentValueList() {
	$lists = [];
	do {
		$current = new ComponentValueList();
		$endedOnComma = false;
		while ( true ) {
			// Move to the first/next token
			$this->consumeToken();
			$cv = $this->consumeComponentValue();
			if ( $cv instanceof Token ) {
				$cvType = $cv->type();
				if ( $cvType === Token::T_EOF || $cvType === Token::T_COMMA ) {
					// A comma means another list follows; EOF ends everything.
					$endedOnComma = $cvType === Token::T_COMMA;
					break;
				}
			}
			$current->add( $cv );
		}
		$lists[] = $current;
	} while ( $endedOnComma );
	return $lists;
}
/**
 * Consume rules until EOF
 *
 * Expects the current token to be the first token of the list. Whitespace
 * between rules is skipped, and qualified rules that come back as null are
 * left out.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-list-of-rules
 * @param bool $topLevel Determines the behavior when CDO and CDC tokens are encountered:
 *  when true they are skipped, otherwise they start a qualified rule.
 * @return RuleList
 */
protected function consumeRuleList( $topLevel ) {
	$rules = new RuleList();
	while ( $this->currentToken->type() !== Token::T_EOF ) {
		$type = $this->currentToken->type();
		$rule = false;
		if ( $type === Token::T_AT_KEYWORD ) {
			$rule = $this->consumeAtRule();
		} elseif ( $type === Token::T_CDO || $type === Token::T_CDC ) {
			// At the top level CDO/CDC are ignored; elsewhere they are
			// ordinary tokens that begin a qualified rule.
			if ( !$topLevel ) {
				$rule = $this->consumeQualifiedRule();
			}
		} elseif ( $type !== Token::T_WHITESPACE ) {
			$rule = $this->consumeQualifiedRule();
		}
		if ( $rule ) {
			$rules->add( $rule );
		}
		// Step past the last token of whatever was just handled
		$this->consumeToken();
	}
	return $rules;
}
/**
* Consume a list of declarations and at-rules
*
* Expects the current token to be the first token of the list and runs
* until EOF. Declarations that fail to parse (null) are left out of the
* result; their errors are still recorded.
* @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-list-of-declarations
* @param bool $allowAtRules Whether to allow at-rules. This flag is not in
* the spec and is used to implement the non-spec self::parseDeclarationList().
* When false, at-rules are consumed and discarded with a parse error.
* @return DeclarationOrAtRuleList|DeclarationList
*/
protected function consumeDeclarationOrAtRuleList( $allowAtRules = true ) {
// @phan-suppress-previous-line PhanPluginNeverReturnMethod
$list = $allowAtRules ? new DeclarationOrAtRuleList() : new DeclarationList();
// @phan-suppress-next-line PhanInfiniteLoop
while ( true ) {
// Both false and null are falsy, so either value means "nothing to add
// for this iteration"; only an actual object is appended below.
$declaration = false;
switch ( $this->currentToken->type() ) {
case Token::T_WHITESPACE:
break;
case Token::T_SEMICOLON:
$declaration = null;
break;
case Token::T_EOF:
// Leaves the while loop, not just the switch
break 2;
case Token::T_AT_KEYWORD:
if ( $allowAtRules ) {
$declaration = $this->consumeAtRule();
} else {
// Still consume the whole at-rule so parsing resumes after it,
// but throw the result away.
$this->parseError( 'unexpected-token-in-declaration-list', $this->currentToken );
$this->consumeAtRule();
$declaration = null;
}
break;
case Token::T_IDENT:
// Gather every component value up to (but not including) the next
// semicolon or EOF.
$cvs = [];
do {
$cvs[] = $this->consumeComponentValue();
$this->consumeToken();
} while (
$this->currentToken->type() !== Token::T_SEMICOLON &&
$this->currentToken->type() !== Token::T_EOF
);
// Flatten them back to tokens and parse those as one declaration with
// a separate parser; the terminating token (semicolon or EOF) is
// passed as that parser's $eof argument.
$tokens = ( new ComponentValueList( $cvs ) )->toTokenArray();
$parser = static::newFromTokens( $tokens, $this->currentToken );
// Load that first token
$parser->consumeToken();
$declaration = $parser->consumeDeclaration();
// Propagate any errors
$this->parseErrors = array_merge( $this->parseErrors, $parser->parseErrors );
break;
default:
// Anything else cannot start a declaration: report it, then skip
// component values up to the next semicolon or EOF.
$this->parseError( 'unexpected-token-in-declaration-list', $this->currentToken );
do {
$this->consumeComponentValue();
$this->consumeToken();
} while (
$this->currentToken->type() !== Token::T_SEMICOLON &&
$this->currentToken->type() !== Token::T_EOF
);
$declaration = null;
break;
}
if ( $declaration ) {
$list->add( $declaration );
}
// Step past the last token of whatever was just handled (a no-op at EOF)
$this->consumeToken();
}
// @phan-suppress-next-line PhanPluginUnreachableCode Reached by break 2
return $list;
}
/**
* Consume a declaration
*
* Expects the current token to be the declaration's name and consumes
* everything up to EOF as its value. A trailing "!important" is removed
* from the value and recorded on the declaration instead, and trailing
* whitespace is trimmed from the value.
* @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-declaration
* @return Declaration|null Null (with an 'expected-colon' parse error) when
* the name is not followed by a colon.
*/
protected function consumeDeclaration() {
$declaration = new Declaration( $this->currentToken );
// 1.
// Moves from the name to the next non-whitespace token
$this->consumeTokenAndWhitespace();
// 2. and 3.
if ( $this->currentToken->type() !== Token::T_COLON ) {
$this->parseError( 'expected-colon', $this->currentToken );
return null;
}
// Moves past the colon to the first non-whitespace token of the value
$this->consumeTokenAndWhitespace();
// 4.
$value = $declaration->getValue();
// $l1 and $l2 are the indexes in $value of the second-to-last and last
// component values that did not end on a whitespace token (-1 while
// there is no such value yet).
$l1 = $l2 = -1;
while ( $this->currentToken->type() !== Token::T_EOF ) {
$value->add( $this->consumeComponentValue() );
if ( $this->currentToken->type() !== Token::T_WHITESPACE ) {
$l1 = $l2;
$l2 = $value->count() - 1;
}
$this->consumeToken();
}
// 5. and part of 6.
// @phan-suppress-next-line PhanSuspiciousValueComparison False positive about $l1 is -1
$v1 = $l1 >= 0 ? $value[$l1] : null;
$v2 = $l2 >= 0 ? $value[$l2] : null;
// "!important" is a "!" delim followed by the ident "important", matched
// case-insensitively (strcasecmp() returns 0 on a match).
if ( $v1 instanceof Token &&
$v1->type() === Token::T_DELIM &&
$v1->value() === '!' &&
$v2 instanceof Token &&
$v2->type() === Token::T_IDENT &&
!strcasecmp( $v2->value(), 'important' )
) {
// This removes the "!" and "important" (5), and also any whitespace between/after (6)
// NOTE(review): presumably remove() re-indexes the list so that later
// entries shift down to $l1; the loop relies on that to terminate.
while ( isset( $value[$l1] ) ) {
$value->remove( $l1 );
}
$declaration->setImportant( true );
}
// Rest of 6.
// Walk backwards from the end, dropping whitespace tokens until something
// else (or the start of the value) is reached.
$i = $value->count();
// @phan-suppress-next-line PhanNonClassMethodCall False positive
while ( --$i >= 0 && $value[$i] instanceof Token && $value[$i]->type() === Token::T_WHITESPACE ) {
$value->remove( $i );
}
// 7.
return $declaration;
}
/**
 * Consume an at-rule
 *
 * Expects the current token to be the at-keyword. Component values are
 * added to the prelude until a semicolon, a "{" block (which becomes the
 * rule's block) or EOF ends the rule. Ending at EOF records an
 * 'unexpected-eof-in-rule' parse error, unless that EOF is the one produced
 * when the recursion limit was hit.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-at-rule
 * @return AtRule
 */
protected function consumeAtRule() {
	$rule = new AtRule( $this->currentToken );
	$this->consumeToken();

	// Spec has "simple block with an associated token of <{-token>" as a
	// further case, but that isn't possible here because it's not a Token.
	$type = $this->currentToken->type();
	while (
		$type !== Token::T_SEMICOLON &&
		$type !== Token::T_EOF &&
		$type !== Token::T_LEFT_BRACE
	) {
		$rule->getPrelude()->add( $this->consumeComponentValue() );
		$this->consumeToken();
		$type = $this->currentToken->type();
	}

	if ( $type === Token::T_LEFT_BRACE ) {
		$rule->setBlock( $this->consumeSimpleBlock() );
	} elseif ( $type === Token::T_EOF &&
		$this->currentToken->typeFlag() !== 'recursion-depth-exceeded'
	) {
		$this->parseError( 'unexpected-eof-in-rule', $this->currentToken );
	}
	return $rule;
}
/**
 * Consume a qualified rule
 *
 * Expects the current token to be the first token of the rule. Component
 * values are added to the prelude until a "{" block is found, which becomes
 * the rule's block. If EOF comes first, the rule is dropped: null is
 * returned, with an 'unexpected-eof-in-rule' parse error unless that EOF is
 * the one produced when the recursion limit was hit.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-qualified-rule
 * @return QualifiedRule|null
 */
protected function consumeQualifiedRule() {
	$rule = new QualifiedRule( $this->currentToken );

	// Spec has "simple block with an associated token of <{-token>" as a
	// further case, but that isn't possible here because it's not a Token.
	$type = $this->currentToken->type();
	while ( $type !== Token::T_EOF && $type !== Token::T_LEFT_BRACE ) {
		$rule->getPrelude()->add( $this->consumeComponentValue() );
		$this->consumeToken();
		$type = $this->currentToken->type();
	}

	if ( $type === Token::T_EOF ) {
		if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
			$this->parseError( 'unexpected-eof-in-rule', $this->currentToken );
		}
		return null;
	}

	$rule->setBlock( $this->consumeSimpleBlock() );
	return $rule;
}
/**
 * Consume a component value
 *
 * Expects the current token to be the first token of the value. An opening
 * brace, bracket or parenthesis yields a simple block, a function token
 * yields a function, and any other token is returned as is.
 *
 * Nesting is limited to CV_DEPTH_LIMIT levels. Beyond that a
 * 'recursion-depth-exceeded' parse error is recorded, the rest of the input
 * is thrown away, and a specially flagged EOF token is returned.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-component-value
 * @return ComponentValue
 */
protected function consumeComponentValue() {
	$this->cvDepth++;
	if ( $this->cvDepth > static::CV_DEPTH_LIMIT ) {
		$this->parseError( 'recursion-depth-exceeded', $this->currentToken );
		// There's no way to safely recover from this without more recursion.
		// So just eat the rest of the input, then return a
		// specially-flagged EOF, so we can avoid 100 "unexpected EOF"
		// errors.
		$errorPosition = $this->currentToken->getPosition();
		while ( $this->currentToken->type() !== Token::T_EOF ) {
			$this->consumeToken();
		}
		$flaggedEof = new Token( Token::T_EOF, [
			'position' => $errorPosition,
			'typeFlag' => 'recursion-depth-exceeded'
		] );
		$this->currentToken = $flaggedEof;
	}

	$type = $this->currentToken->type();
	if ( $type === Token::T_FUNCTION ) {
		$result = $this->consumeFunction();
	} elseif ( $type === Token::T_LEFT_BRACE ||
		$type === Token::T_LEFT_BRACKET ||
		$type === Token::T_LEFT_PAREN
	) {
		$result = $this->consumeSimpleBlock();
	} else {
		// Any other token is a component value by itself
		$result = $this->currentToken;
	}

	$this->cvDepth--;
	return $result;
}
/**
 * Consume a simple block
 *
 * Expects the current token to be the block's opening token. Component
 * values are added to the block until the matching end token or EOF. Ending
 * at EOF records an 'unexpected-eof-in-block' parse error, unless that EOF
 * is the one produced when the recursion limit was hit.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-simple-block
 * @return SimpleBlock
 */
protected function consumeSimpleBlock() {
	$block = new SimpleBlock( $this->currentToken );
	$closingType = $block->getEndTokenType();
	$this->consumeToken();

	$type = $this->currentToken->type();
	while ( $type !== Token::T_EOF && $type !== $closingType ) {
		$block->getValue()->add( $this->consumeComponentValue() );
		$this->consumeToken();
		$type = $this->currentToken->type();
	}

	if ( $type === Token::T_EOF &&
		$this->currentToken->typeFlag() !== 'recursion-depth-exceeded'
	) {
		$this->parseError( 'unexpected-eof-in-block', $this->currentToken );
	}
	return $block;
}
/**
 * Consume a function
 *
 * Expects the current token to be the function token. Component values are
 * added to the function until a closing parenthesis or EOF. Ending at EOF
 * records an 'unexpected-eof-in-function' parse error, unless that EOF is
 * the one produced when the recursion limit was hit.
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-function
 * @return CSSFunction
 */
protected function consumeFunction() {
	$function = new CSSFunction( $this->currentToken );
	$this->consumeToken();

	$type = $this->currentToken->type();
	while ( $type !== Token::T_EOF && $type !== Token::T_RIGHT_PAREN ) {
		$function->getValue()->add( $this->consumeComponentValue() );
		$this->consumeToken();
		$type = $this->currentToken->type();
	}

	if ( $type === Token::T_EOF &&
		$this->currentToken->typeFlag() !== 'recursion-depth-exceeded'
	) {
		$this->parseError( 'unexpected-eof-in-function', $this->currentToken );
	}
	return $function;
}
// @codeCoverageIgnoreEnd
}