string = (string)$string;
        $this->len = strlen( $this->string );

        // HHVM 3.4 and older come with an outdated version of libmbfl that
        // incorrectly allows values above U+10FFFF, so we have to check
        // for them separately. (This issue also exists in PHP 5.3 and
        // older, which are no longer supported.)
        // @codeCoverageIgnoreStart
        if ( $newPHP === null ) {
            // Probe once: "\xf4\x90\x80\x80" encodes a value above U+10FFFF,
            // so a correct mb_check_encoding() must reject it.
            $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
        }
        // @codeCoverageIgnoreEnd

        // Note on precedence: && binds tighter than ||, so the byte-pattern
        // fallback only runs when the probe above showed mbstring is lenient.
        if ( !mb_check_encoding( $this->string, 'UTF-8' ) ||
            !$newPHP && preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $this->string ) !== 0
        ) {
            throw new \InvalidArgumentException( '$string is not valid UTF-8' );
        }
    }

    /**
     * Read the next character from the source.
     *
     * Characters handed to putBackCharacter() are returned first, most
     * recently put back first. Otherwise the next UTF-8 sequence is cut out
     * of the string and the read position is advanced past it.
     *
     * @return string One UTF-8 character, or self::EOF once the end of the
     *  string has been reached
     * @throws \UnexpectedValueException If the byte at the current position
     *  is not a valid UTF-8 lead byte
     */
    public function readCharacter() {
        // Pending put-back characters take priority over the string itself.
        if ( $this->putBack ) {
            return array_pop( $this->putBack );
        }

        $start = $this->pos;
        if ( $start >= $this->len ) {
            return self::EOF;
        }

        // The constructor already validated the string as UTF-8, so the lead
        // byte alone tells us how many bytes the character occupies.
        $lead = ord( $this->string[$start] );
        if ( $lead <= 0x7f ) {
            // 0xxxxxxx: single-byte (ASCII) character.
            $width = 1;
        } elseif ( ( $lead & 0xe0 ) === 0xc0 ) {
            // 110xxxxx: two-byte sequence.
            $width = 2;
        } elseif ( ( $lead & 0xf0 ) === 0xe0 ) {
            // 1110xxxx: three-byte sequence.
            $width = 3;
        } elseif ( ( $lead & 0xf8 ) === 0xf0 ) {
            // 11110xxx: four-byte sequence.
            $width = 4;
        } else {
            // Should be unreachable: anything else would have failed
            // validation in the constructor. The position is deliberately
            // left unadvanced so the message points at the offending byte.
            // @codeCoverageIgnoreStart
            throw new \UnexpectedValueException( sprintf(
                'Unexpected byte %02X in string at position %d.', $lead, $start
            ) );
            // @codeCoverageIgnoreEnd
        }

        $this->pos = $start + $width;
        return substr( $this->string, $start, $width );
    }

    /**
     * Return a character to the source so that a later readCharacter() call
     * yields it again.
     *
     * @param string $char Character to put back; self::EOF is ignored
     */
    public function putBackCharacter( $char ) {
        if ( $char === self::EOF ) {
            // Nothing to restore: EOF is never queued.
            return;
        }

        $this->putBack[] = $char;
    }
}