pokemon-showdown-client/lib/css-sanitizer/Wikimedia/CSS/Parser/Parser.php

<?php
/**
 * @file
 * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
 */

namespace Wikimedia\CSS\Parser;

use Wikimedia\CSS\Objects\AtRule;
use Wikimedia\CSS\Objects\ComponentValueList;
use Wikimedia\CSS\Objects\ComponentValue;
use Wikimedia\CSS\Objects\CSSFunction;
use Wikimedia\CSS\Objects\DeclarationList;
use Wikimedia\CSS\Objects\DeclarationOrAtRuleList;
use Wikimedia\CSS\Objects\Declaration;
use Wikimedia\CSS\Objects\QualifiedRule;
use Wikimedia\CSS\Objects\Rule;
use Wikimedia\CSS\Objects\RuleList;
use Wikimedia\CSS\Objects\SimpleBlock;
use Wikimedia\CSS\Objects\Stylesheet;
use Wikimedia\CSS\Objects\Token;
use Wikimedia\CSS\Sanitizer\Sanitizer;

// Note: While reading the code below, you might find that my calls to
// consumeToken() don't match what the spec says and that I never "reconsume"
// a token. The spec is overcomplicated and confused with respect to the
// "current input token" and the "next input token". In practice things are
// pretty simple: every "consume an X" is called with the current input token
// being the first token of X, and returns with the current input token being
// the last token of X (or EOF if X ends at EOF).

// Also of note is that, since our Tokenizer can only return a stream of tokens
// rather than a stream of component values, the consume functions here only
// consider tokens. ComponentValueList::toTokenArray() may be used to convert a
// list of component values to a list of tokens if necessary.

/**
 * Parse CSS into a structure for further processing.
 *
 * This implements the CSS Syntax Module Level 3 candidate recommendation.
 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/
 *
 * The usual entry points are:
 *  - Parser::parseStylesheet() to parse a stylesheet or the contents of a <style> tag.
 *  - Parser::parseDeclarationList() to parse an inline style attribute
 */
class Parser {
	/** Maximum depth of nested ComponentValues */
	const CV_DEPTH_LIMIT = 100; // Arbitrary number that seems like it should be enough

	/** @var Tokenizer */
	protected $tokenizer;

	/** @var Token|null The most recently consumed token */
	protected $currentToken = null;

	/** @var array Parse errors. Each error is [ string $tag, int $line, int $pos ] */
	protected $parseErrors = [];

	/** @var int Recursion depth, incremented in self::consumeComponentValue() */
	protected $cvDepth = 0;

	/**
	 * Build a parser that pulls its tokens from the given tokenizer
	 * @param Tokenizer $tokenizer Tokenizer supplying the token stream to parse
	 */
	public function __construct( Tokenizer $tokenizer ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Build a Parser that reads CSS from a string
	 * @param string $source The CSS text.
	 * @param array $options Configuration options, forwarded to
	 *  self::newFromDataSource() (see DataSourceTokenizer::__construct()). Additionally,
	 *  - convert: (array) When set, the source is first passed through
	 *    Encoder::convert() with this value as the $encodings argument, so the
	 *    encoding is detected as defined in the CSS spec.
	 * @return static
	 */
	public static function newFromString( $source, array $options = [] ) {
		// Only re-encode when the caller asked for it
		$css = isset( $options['convert'] )
			? Encoder::convert( $source, $options['convert'] )
			: $source;

		$dataSource = new StringDataSource( $css );
		return static::newFromDataSource( $dataSource, $options );
	}

	/**
	 * Build a Parser that reads CSS from a DataSource
	 * @param DataSource $source Where the CSS comes from.
	 * @param array $options Configuration options, see DataSourceTokenizer::__construct().
	 * @return static
	 */
	public static function newFromDataSource( DataSource $source, array $options = [] ) {
		return new static( new DataSourceTokenizer( $source, $options ) );
	}

	/**
	 * Build a Parser that reads from an already-tokenized list
	 * @param Token[] $tokens The tokens to parse
	 * @param Token|null $eof Token handed to the TokenListTokenizer as its EOF token
	 * @return static
	 */
	public static function newFromTokens( array $tokens, Token $eof = null ) {
		return new static( new TokenListTokenizer( $tokens, $eof ) );
	}

	/**
	 * Advance to the next token
	 *
	 * Once the current token is EOF this is a no-op, so the parser stays on EOF.
	 * Any parse errors the tokenizer recorded are moved into this parser's own list.
	 */
	protected function consumeToken() {
		$atEof = $this->currentToken && $this->currentToken->type() === Token::T_EOF;
		if ( $atEof ) {
			return;
		}

		$this->currentToken = $this->tokenizer->consumeToken();

		// Take over whatever errors the tokenizer collected, then reset it
		$tokenizerErrors = $this->tokenizer->getParseErrors();
		foreach ( $tokenizerErrors as $tokenizerError ) {
			$this->parseErrors[] = $tokenizerError;
		}
		$this->tokenizer->clearParseErrors();
	}

	/**
	 * Advance to the next token that is not whitespace
	 *
	 * Always consumes at least one token, then keeps consuming for as long as
	 * the current token is a whitespace token.
	 */
	protected function consumeTokenAndWhitespace() {
		$this->consumeToken();
		while ( $this->currentToken->type() === Token::T_WHITESPACE ) {
			$this->consumeToken();
		}
	}

	/**
	 * Fetch the parse errors collected so far
	 * @return array List of errors, each one being [ string $tag, int $line, int $pos, ... ]
	 */
	public function getParseErrors() {
		return $this->parseErrors;
	}

	/**
	 * Forget all parse errors collected so far
	 */
	public function clearParseErrors() {
		$this->parseErrors = [];
	}

	/**
	 * Append a parse error to the error list
	 * @param string $tag Tag identifying the kind of error
	 * @param Token $token Token whose position is reported as the error location.
	 * @param array $data Additional values appended after the tag, line and position.
	 */
	protected function parseError( $tag, Token $token, array $data = [] ) {
		list( $line, $pos ) = $token->getPosition();
		$entry = [ $tag, $line, $pos ];
		$this->parseErrors[] = array_merge( $entry, $data );
	}

	/**
	 * Parse a complete stylesheet
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-stylesheet
	 * @note Per the Editor's Draft, a leading at-rule named "charset" is
	 *  silently dropped. Only the first rule is examined; if you're not using
	 *  the provided Sanitizer classes to further sanitize the CSS, you'll want
	 *  to manually filter out any other such rules before stringifying the
	 *  stylesheet and/or prepend `@charset "utf-8";` after stringifying it.
	 * @return Stylesheet
	 */
	public function parseStylesheet() {
		// Load the first token, then read rules in top-level mode
		$this->consumeToken();
		$rules = $this->consumeRuleList( true );

		// Drop a leading @charset per the Editor's Draft
		$first = isset( $rules[0] ) ? $rules[0] : null;
		if ( $first instanceof AtRule && strcasecmp( $first->getName(), 'charset' ) === 0 ) {
			$rules->remove( 0 );
			$rules->rewind();
		}

		return new Stylesheet( $rules );
	}

	/**
	 * Parse a list of rules (not in top-level mode)
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-rules
	 * @return RuleList
	 */
	public function parseRuleList() {
		// Load the first token before handing over to the consumer
		$this->consumeToken();
		$rules = $this->consumeRuleList( false );
		return $rules;
	}

	/**
	 * Parse exactly one rule
	 *
	 * Records a parse error and returns null when the input is empty, when a
	 * qualified rule cannot be consumed, or when anything other than
	 * whitespace follows the rule.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-rule
	 * @return Rule|null
	 */
	public function parseRule() {
		// Steps 1 and 2: skip to the first non-whitespace token
		$this->consumeTokenAndWhitespace();

		// Step 3: nothing to parse, or consume the one rule
		if ( $this->currentToken->type() === Token::T_EOF ) {
			$this->parseError( 'unexpected-eof', $this->currentToken ); // "return a syntax error"?
			return null;
		}

		$isAtRule = $this->currentToken->type() === Token::T_AT_KEYWORD;
		if ( $isAtRule ) {
			$rule = $this->consumeAtRule();
		} else {
			$rule = $this->consumeQualifiedRule();
			if ( !$rule ) {
				return null;
			}
		}

		// Step 4: skip trailing whitespace
		$this->consumeTokenAndWhitespace();

		// Step 5: only EOF may follow the rule
		if ( $this->currentToken->type() !== Token::T_EOF ) {
			$this->parseError( 'expected-eof', $this->currentToken ); // "return a syntax error"?
			return null;
		}

		return $rule;
	}

	/**
	 * Parse a single declaration
	 *
	 * Records an 'expected-ident' parse error and returns null when the first
	 * non-whitespace token is not an identifier.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-declaration
	 * @return Declaration|null
	 */
	public function parseDeclaration() {
		// Steps 1 and 2: skip to the first non-whitespace token
		$this->consumeTokenAndWhitespace();

		// Step 3: a declaration has to start with an identifier
		$startsWithIdent = $this->currentToken->type() === Token::T_IDENT;
		if ( !$startsWithIdent ) {
			$this->parseError( 'expected-ident', $this->currentToken ); // "return a syntax error"?
			return null;
		}

		// Step 4. Declarations always run to EOF, so there is no trailing check.
		return $this->consumeDeclaration();
	}

	/**
	 * Parse a list made up of declarations only (at-rules are not allowed)
	 * @note The standard's "parse a list of declarations" entry point is
	 *  self::parseDeclarationOrAtRuleList(), not this method.
	 * @return DeclarationList
	 */
	public function parseDeclarationList() {
		// Load the first token before handing over to the consumer
		$this->consumeToken();
		$declarations = $this->consumeDeclarationOrAtRuleList( false );
		return $declarations;
	}

	/**
	 * Parse a list that may contain both declarations and at-rules
	 * @note The standard calls this entry point "parse a list of declarations".
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-declarations
	 * @return DeclarationOrAtRuleList
	 */
	public function parseDeclarationOrAtRuleList() {
		// Load the first token before handing over to the consumer
		$this->consumeToken();
		$items = $this->consumeDeclarationOrAtRuleList();
		return $items;
	}

	/**
	 * Parse exactly one (non-whitespace) component value
	 *
	 * Records a parse error and returns null when the input is empty or when
	 * anything other than whitespace follows the component value.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-component-value
	 * @return ComponentValue|null
	 */
	public function parseComponentValue() {
		// Steps 1 and 2: skip to the first non-whitespace token
		$this->consumeTokenAndWhitespace();

		// Step 3: nothing to parse
		if ( $this->currentToken->type() === Token::T_EOF ) {
			$this->parseError( 'unexpected-eof', $this->currentToken ); // "return a syntax error"?
			return null;
		}

		// Step 4. The spec says to return a syntax error if nothing is
		// returned, but that can never happen and the Editor's Draft removed
		// that language.
		$componentValue = $this->consumeComponentValue();

		// Step 5: skip trailing whitespace
		$this->consumeTokenAndWhitespace();

		// Step 6: only EOF may follow the component value
		if ( $this->currentToken->type() !== Token::T_EOF ) {
			$this->parseError( 'expected-eof', $this->currentToken ); // "return a syntax error"?
			return null;
		}

		return $componentValue;
	}

	/**
	 * Parse the whole input as a list of component values
	 *
	 * Component values are collected until an EOF token is produced; that EOF
	 * token itself is not added to the list.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#parse-a-list-of-component-values
	 * @return ComponentValueList
	 */
	public function parseComponentValueList() {
		$values = new ComponentValueList();

		// Load the first token and read the first component value
		$this->consumeToken();
		$componentValue = $this->consumeComponentValue();

		while ( !( $componentValue instanceof Token && $componentValue->type() === Token::T_EOF ) ) {
			$values->add( $componentValue );

			// Move on to the next token and component value
			$this->consumeToken();
			$componentValue = $this->consumeComponentValue();
		}

		return $values;
	}

	/**
	 * Consume rules until EOF
	 *
	 * Whitespace between rules is skipped. At-keyword tokens start an at-rule,
	 * everything else starts a qualified rule. Qualified rules that could not
	 * be consumed (null) are left out of the list.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-list-of-rules
	 * @param bool $topLevel When true, CDO and CDC tokens are skipped; otherwise
	 *  they are treated as the start of a qualified rule.
	 * @return RuleList
	 */
	protected function consumeRuleList( $topLevel ) {
		$rules = new RuleList();

		for ( ; ; $this->consumeToken() ) {
			$type = $this->currentToken->type();

			if ( $type === Token::T_EOF ) {
				break;
			}
			if ( $type === Token::T_WHITESPACE ) {
				continue;
			}

			$isCommentDelimiter = $type === Token::T_CDO || $type === Token::T_CDC;
			if ( $isCommentDelimiter && $topLevel ) {
				// Ignored at the top level
				continue;
			}

			if ( $type === Token::T_AT_KEYWORD ) {
				$rule = $this->consumeAtRule();
			} else {
				$rule = $this->consumeQualifiedRule();
			}

			if ( $rule ) {
				$rules->add( $rule );
			}
		}

		return $rules;
	}

	/**
	 * Consume declarations (and optionally at-rules) until EOF
	 *
	 * Whitespace and semicolons between items are skipped. A run of component
	 * values starting with an identifier is re-parsed as a declaration by a
	 * temporary parser; any errors that parser records are merged into this one.
	 * Anything else is reported as 'unexpected-token-in-declaration-list' and
	 * skipped up to the next semicolon or EOF.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-list-of-declarations
	 * @param bool $allowAtRules Whether at-rules are kept. When false they are
	 *  still consumed, but reported as an error and discarded. This flag is not
	 *  in the spec, and is used to implement the non-spec self::parseDeclarationList().
	 * @return DeclarationOrAtRuleList|DeclarationList
	 */
	protected function consumeDeclarationOrAtRuleList( $allowAtRules = true ) {
		if ( $allowAtRules ) {
			$items = new DeclarationOrAtRuleList();
		} else {
			$items = new DeclarationList();
		}

		for ( ; ; $this->consumeToken() ) {
			$type = $this->currentToken->type();
			if ( $type === Token::T_EOF ) {
				break;
			}

			// Stays null unless this iteration yields something to keep
			$item = null;

			switch ( $type ) {
				case Token::T_WHITESPACE:
				case Token::T_SEMICOLON:
					// Separators, nothing to collect
					break;

				case Token::T_AT_KEYWORD:
					if ( !$allowAtRules ) {
						// Report it, then consume and throw away the whole at-rule
						$this->parseError( 'unexpected-token-in-declaration-list', $this->currentToken );
						$this->consumeAtRule();
					} else {
						$item = $this->consumeAtRule();
					}
					break;

				case Token::T_IDENT:
					// The draft changes this to ComponentValue instead of Token, which makes more sense.
					$componentValues = [];
					while ( true ) {
						$componentValues[] = $this->consumeComponentValue();
						$this->consumeToken();
						$next = $this->currentToken->type();
						if ( $next === Token::T_SEMICOLON || $next === Token::T_EOF ) {
							break;
						}
					}

					// Hand the collected tokens to a sub-parser to build the declaration
					$collected = new ComponentValueList( $componentValues );
					$subParser = static::newFromTokens( $collected->toTokenArray(), $this->currentToken );
					$subParser->consumeToken(); // Load that first token
					$item = $subParser->consumeDeclaration();

					// Propagate any errors
					$this->parseErrors = array_merge( $this->parseErrors, $subParser->parseErrors );
					break;

				default:
					$this->parseError( 'unexpected-token-in-declaration-list', $this->currentToken );
					// Skip everything up to the next semicolon or EOF
					while ( true ) {
						$this->consumeComponentValue();
						$this->consumeToken();
						$next = $this->currentToken->type();
						if ( $next === Token::T_SEMICOLON || $next === Token::T_EOF ) {
							break;
						}
					}
					break;
			}

			if ( $item ) {
				$items->add( $item );
			}
		}

		return $items;
	}

	/**
	 * Consume a declaration, running to EOF
	 *
	 * The current token is used as the declaration's first token. If the next
	 * non-whitespace token is not a colon, an 'expected-colon' error is
	 * recorded and null is returned. A trailing "!important" (compared
	 * case-insensitively) is stripped from the value and recorded via
	 * setImportant().
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-declaration
	 * @return Declaration|null
	 */
	protected function consumeDeclaration() {
		$decl = new Declaration( $this->currentToken );

		// Step 2: move past the name and any whitespace
		$this->consumeTokenAndWhitespace();

		// Step 3: a colon has to follow the name
		if ( $this->currentToken->type() !== Token::T_COLON ) {
			$this->parseError( 'expected-colon', $this->currentToken );
			return null;
		}
		$this->consumeToken();

		// Step 4: collect the value, tracking the indexes of the last two
		// component values that did not end on a whitespace token
		$values = $decl->getValue();
		$secondLastIdx = -1;
		$lastIdx = -1;
		for ( ; $this->currentToken->type() !== Token::T_EOF; $this->consumeToken() ) {
			// The draft changes this to ComponentValue instead of Token, which makes more sense.
			$values->add( $this->consumeComponentValue() );
			if ( $this->currentToken->type() === Token::T_WHITESPACE ) {
				continue;
			}
			$secondLastIdx = $lastIdx;
			$lastIdx = $values->count() - 1;
		}

		// Step 5: detect a trailing "!" followed by "important"
		$bang = $secondLastIdx >= 0 ? $values[$secondLastIdx] : null;
		$word = $lastIdx >= 0 ? $values[$lastIdx] : null;
		$endsWithImportant =
			$bang instanceof Token && $bang->type() === Token::T_DELIM && $bang->value() === '!' &&
			$word instanceof Token && $word->type() === Token::T_IDENT &&
			strcasecmp( $word->value(), 'important' ) === 0;

		if ( $endsWithImportant ) {
			// Technically it doesn't say to remove any whitespace within/after
			// the "!important" too, but it makes sense to do so. Repeatedly
			// removing at the "!" index drops it and everything after it.
			while ( isset( $values[$secondLastIdx] ) ) {
				$values->remove( $secondLastIdx );
			}
			$decl->setImportant( true );
		}

		// Step 6
		return $decl;
	}

	/**
	 * Consume an at-rule
	 *
	 * The current token is used as the rule's at-keyword token. Component
	 * values are added to the prelude until one of the following ends the rule:
	 *  - a semicolon: the rule is returned without a block;
	 *  - a left brace: a simple block is consumed and attached to the rule;
	 *  - EOF: the rule is returned as-is, and 'unexpected-eof-in-rule' is
	 *    recorded unless the EOF is flagged 'recursion-depth-exceeded' (which
	 *    self::consumeComponentValue() has already reported).
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-an-at-rule
	 * @return AtRule
	 */
	protected function consumeAtRule() {
		$rule = new AtRule( $this->currentToken );
		$this->consumeToken();
		while ( true ) {
			switch ( $this->currentToken->type() ) {
				case Token::T_SEMICOLON:
					return $rule;

				case Token::T_EOF:
					// Parse error from the editor's draft as of 2017-01-11
					if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
						$this->parseError( 'unexpected-eof-in-rule', $this->currentToken );
					}
					return $rule;

				case Token::T_LEFT_BRACE:
					// consumeSimpleBlock() declares no parameters, so call it without any
					$rule->setBlock( $this->consumeSimpleBlock() );
					return $rule;

				default:
					$rule->getPrelude()->add( $this->consumeComponentValue() );
					break;
			}
			$this->consumeToken();
		}
		// @codeCoverageIgnoreStart
	}
	// @codeCoverageIgnoreEnd

	/**
	 * Consume a qualified rule
	 *
	 * Starting with the current token, component values are added to the
	 * prelude until a left brace is found, at which point a simple block is
	 * consumed and attached to the rule. If EOF is reached before any block,
	 * the partial rule is discarded and null is returned; in that case
	 * 'unexpected-eof-in-rule' is recorded unless the EOF is flagged
	 * 'recursion-depth-exceeded' (which self::consumeComponentValue() has
	 * already reported).
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-qualified-rule
	 * @return QualifiedRule|null
	 */
	protected function consumeQualifiedRule() {
		$rule = new QualifiedRule( $this->currentToken );
		while ( true ) {
			switch ( $this->currentToken->type() ) {
				case Token::T_EOF:
					if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
						$this->parseError( 'unexpected-eof-in-rule', $this->currentToken );
					}
					return null;

				case Token::T_LEFT_BRACE:
					// consumeSimpleBlock() declares no parameters, so call it without any
					$rule->setBlock( $this->consumeSimpleBlock() );
					return $rule;

				default:
					$rule->getPrelude()->add( $this->consumeComponentValue() );
					break;
			}
			$this->consumeToken();
		}
		// @codeCoverageIgnoreStart
	}
	// @codeCoverageIgnoreEnd

	/**
	 * Consume a component value
	 *
	 * A left brace, bracket or parenthesis starts a simple block, a function
	 * token starts a function, and any other token is itself the component value.
	 *
	 * Nesting is tracked in $this->cvDepth. When it goes beyond
	 * static::CV_DEPTH_LIMIT, a 'recursion-depth-exceeded' error is recorded,
	 * the remaining input is discarded, and the current token is replaced by
	 * an EOF token carrying the 'recursion-depth-exceeded' type flag.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-component-value
	 * @return ComponentValue
	 */
	protected function consumeComponentValue() {
		$this->cvDepth++;
		if ( $this->cvDepth > static::CV_DEPTH_LIMIT ) {
			$this->parseError( 'recursion-depth-exceeded', $this->currentToken );

			// There's no way to safely recover from this without more recursion.
			// So just eat the rest of the input, then return a
			// specially-flagged EOF so we can avoid 100 "unexpected EOF"
			// errors.
			$errorPosition = $this->currentToken->getPosition();
			while ( $this->currentToken->type() !== Token::T_EOF ) {
				$this->consumeToken();
			}
			$eofData = [
				'position' => $errorPosition,
				'typeFlag' => 'recursion-depth-exceeded'
			];
			$this->currentToken = new Token( Token::T_EOF, $eofData );
		}

		$type = $this->currentToken->type();
		$opensBlock = $type === Token::T_LEFT_BRACE ||
			$type === Token::T_LEFT_BRACKET ||
			$type === Token::T_LEFT_PAREN;

		if ( $opensBlock ) {
			$componentValue = $this->consumeSimpleBlock();
		} elseif ( $type === Token::T_FUNCTION ) {
			$componentValue = $this->consumeFunction();
		} else {
			// A plain token is its own component value
			$componentValue = $this->currentToken;
		}

		$this->cvDepth--;
		return $componentValue;
	}

	/**
	 * Consume a simple block
	 *
	 * The current token is used as the block's opening token. Component values
	 * are added to the block until the matching end token (as reported by the
	 * block itself) or EOF is reached. Reaching EOF records
	 * 'unexpected-eof-in-block', unless the EOF is flagged
	 * 'recursion-depth-exceeded'.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-simple-block
	 * @return SimpleBlock
	 */
	protected function consumeSimpleBlock() {
		$block = new SimpleBlock( $this->currentToken );
		$closingType = $block->getEndTokenType();

		for ( $this->consumeToken(); ; $this->consumeToken() ) {
			$type = $this->currentToken->type();

			if ( $type === Token::T_EOF ) {
				// Parse error from the editor's draft as of 2017-01-12
				if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
					$this->parseError( 'unexpected-eof-in-block', $this->currentToken );
				}
				return $block;
			}

			if ( $type === $closingType ) {
				return $block;
			}

			$block->getValue()->add( $this->consumeComponentValue() );
		}
		// @codeCoverageIgnoreStart
	}
	// @codeCoverageIgnoreEnd

	/**
	 * Consume a function
	 *
	 * The current token is used as the function token. Component values are
	 * added to the function until a right parenthesis or EOF is reached.
	 * Reaching EOF records 'unexpected-eof-in-function', unless the EOF is
	 * flagged 'recursion-depth-exceeded'.
	 *
	 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#consume-a-function
	 * @return CSSFunction
	 */
	protected function consumeFunction() {
		$func = new CSSFunction( $this->currentToken );

		for ( $this->consumeToken(); ; $this->consumeToken() ) {
			$type = $this->currentToken->type();

			if ( $type === Token::T_EOF ) {
				// Parse error from the editor's draft as of 2017-01-12
				if ( $this->currentToken->typeFlag() !== 'recursion-depth-exceeded' ) {
					$this->parseError( 'unexpected-eof-in-function', $this->currentToken );
				}
				return $func;
			}

			if ( $type === Token::T_RIGHT_PAREN ) {
				return $func;
			}

			$func->getValue()->add( $this->consumeComponentValue() );
		}
		// @codeCoverageIgnoreStart
	}
	// @codeCoverageIgnoreEnd
}