# mastodon/spec/lib/link_details_extractor_spec.rb

# frozen_string_literal: true

require 'rails_helper'

RSpec.describe LinkDetailsExtractor do
  # Extractor under test. Each example group below supplies its own `html`;
  # the third constructor argument is always nil in this spec
  # (NOTE(review): presumably an optional charset hint — confirm against the class).
  subject do
    described_class.new(original_url, html, nil)
  end

  # URL the page was requested with. It carries a query string, which makes it
  # distinguishable from the canonical URLs declared inside the fixtures.
  let(:original_url) { 'https://example.com/dog.html?tracking=123' }

  # Exercises how #canonical_url treats a <link rel="canonical"> tag in the page.
  describe '#canonical_url' do
    # Minimal document containing nothing but the canonical link under test;
    # each nested group provides the `url` to place in its href.
    let(:html) { "<!doctype html><link rel='canonical' href='#{url}'>" }

    context 'when canonical URL points to the same host' do
      let(:url) { 'https://example.com/dog.html' }

      # Same host as original_url, so the canonical value wins and the
      # tracking query string of original_url is dropped.
      it 'uses the canonical URL' do
        expect(subject.canonical_url).to eq url
      end
    end

    context 'when canonical URL points to another host' do
      let(:url) { 'https://different.example.net/dog.html' }

      # A canonical link on a different host is not accepted; the original URL is kept.
      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end

    context 'when canonical URL is set to "null"' do
      let(:url) { 'null' }

      # The literal string "null" in the href must not be used as a URL.
      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end

    context 'when canonical URL is set to "undefined"' do
      let(:url) { 'undefined' }

      # The literal string "undefined" in the href must not be used as a URL.
      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end
  end

  context 'when only basic metadata is present' do
    # Page with plain HTML metadata only (title, meta description, lang, icon link).
    # The fixture uses a mixed-case `descripTION` name, a multi-token mixed-case
    # `pretty IcoN` rel, an HTML entity in the description and a relative icon
    # href; the example below expects all of them to be handled.
    let(:html) do
      <<~HTML
        <!doctype html>
        <html lang="en">
        <head>
          <title>Man bites dog</title>
          <meta name="descripTION" content="A dog&#39;s tale">
          <link rel="pretty IcoN" href="/favicon.ico">
        </head>
        </html>
      HTML
    end

    it 'extracts the expected values from html metadata' do
      expect(subject).to have_attributes(
        title: eq('Man bites dog'),
        description: eq("A dog's tale"), # entity &#39; decoded
        language: eq('en'),
        icon: eq('https://example.com/favicon.ico') # resolved against original_url's host
      )
    end
  end

  context 'when structured data is present' do
    # Default LD+JSON payload: a NewsArticle carrying every field that the
    # 'structured data' shared examples assert on. Nested groups may override it.
    let(:ld_json) do
      {
        '@context' => 'https://schema.org',
        '@type' => 'NewsArticle',
        'headline' => 'Man bites dog',
        'description' => "A dog's tale",
        'datePublished' => '2022-01-31T19:53:00+00:00',
        'author' => {
          '@type' => 'Organization',
          'name' => 'Charlie Brown',
        },
        'publisher' => {
          '@type' => 'NewsMediaOrganization',
          'name' => 'Pet News',
          'url' => 'https://example.com',
        },
        'inLanguage' => {
          'name' => 'English',
          'alternateName' => 'en',
        },
      }.to_json
    end

    # Default document: a single LD+JSON script tag wrapping `ld_json`.
    # Groups that need a different document shape override `html`; groups that
    # only need different data override `ld_json` and inherit this markup.
    let(:html) { <<~HTML }
      <!doctype html>
      <html>
      <body>
        <script type="application/ld+json">
          #{ld_json}
        </script>
      </body>
      </html>
    HTML

    # Expectations matching the default `ld_json` payload above.
    shared_examples 'structured data' do
      it 'extracts the expected values from structured data' do
        expect(subject)
          .to have_attributes(
            title: eq('Man bites dog'),
            description: eq("A dog's tale"),
            published_at: eq('2022-01-31T19:53:00+00:00'),
            author_name: eq('Charlie Brown'),
            provider_name: eq('Pet News'),
            language: eq('en')
          )
      end
    end

    context 'when is wrapped in CDATA tags' do
      # The JSON is surrounded by `//<![CDATA[` ... `//]]>` marker lines.
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
          <head>
            <script type="application/ld+json">
              //<![CDATA[
              #{ld_json}
              //]]>
            </script>
          </head>
        </html>
      HTML

      it_behaves_like 'structured data'
    end

    context 'with the first tag is invalid JSON' do
      # An unparsable script tag precedes the valid one.
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
          <script type="application/ld+json">
            invalid LD+JSON
          </script>
          <script type="application/ld+json">
            #{ld_json}
          </script>
        </body>
        </html>
      HTML

      it_behaves_like 'structured data'
    end

    context 'with the first tag is null' do
      # A script tag containing the JSON literal `null` precedes the valid one.
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
          <script type="application/ld+json">
            null
          </script>
          <script type="application/ld+json">
            #{ld_json}
          </script>
        </body>
        </html>
      HTML

      it_behaves_like 'structured data'
    end

    context 'with preceding block of unsupported LD+JSON' do
      # The first script tag holds an array of ItemList / BreadcrumbList objects;
      # the NewsArticle payload lives in a second, separate script tag.
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
          <script type="application/ld+json">
            [
              {
                "@context": "https://schema.org",
                "@type": "ItemList",
                "url": "https://example.com/cat.html",
                "name": "Man bites cat",
                "description": "A cat's tale"
              },
              {
                "@context": "https://schema.org",
                "@type": "BreadcrumbList",
                "itemListElement":[
                  {
                    "@type": "ListItem",
                    "position": 1,
                    "item": {
                      "@id": "https://www.example.com",
                      "name": "Cat News"
                    }
                  }
                ]
              }
            ]
          </script>
          <script type="application/ld+json">
            #{ld_json}
          </script>
        </body>
        </html>
      HTML

      it_behaves_like 'structured data'
    end

    context 'with unsupported in same block LD+JSON' do
      # A single script tag holds an array: an ItemList first, then the
      # NewsArticle payload as the second element.
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
          <script type="application/ld+json">
            [
              {
                "@context": "https://schema.org",
                "@type": "ItemList",
                "url": "https://example.com/cat.html",
                "name": "Man bites cat",
                "description": "A cat's tale"
              },
              #{ld_json}
            ]
          </script>
        </body>
        </html>
      HTML

      it_behaves_like 'structured data'
    end

    context 'with author names as array' do
      # Author `name` is an array of two strings rather than a single string.
      let(:ld_json) do
        {
          '@context' => 'https://schema.org',
          '@type' => 'NewsArticle',
          'headline' => 'A lot of authors',
          'description' => 'But we decided to cram them into one',
          'author' => {
            '@type' => 'Person',
            'name' => ['Author 1', 'Author 2'],
          },
        }.to_json
      end

      it 'joins author names' do
        expect(subject.author_name).to eq 'Author 1, Author 2'
      end
    end

    context 'with embedded arrays' do
      # `author` and `publisher` are each wrapped in two levels of array nesting.
      let(:ld_json) do
        {
          '@context' => 'https://schema.org',
          '@type' => 'NewsArticle',
          'headline' => 'A lot of authors',
          'description' => 'But we decided to cram them into one',
          'author' => [[{
            '@type' => 'Person',
            'name' => ['Author 1'],
          }]],
          'publisher' => [[{
            '@type' => 'NewsMediaOrganization',
            'name' => 'Pet News',
            'url' => 'https://example.com',
          }]],
        }.to_json
      end

      it 'gives correct author_name' do
        expect(subject.author_name).to eq 'Author 1'
      end

      it 'gives provider_name' do
        expect(subject.provider_name).to eq 'Pet News'
      end
    end

    context 'with headline and description as language tagged strings' do
      # `headline` and `description` are objects with `@value` / `@language`
      # keys instead of plain strings.
      let(:ld_json) do
        {
          '@context' => 'https://schema.org',
          '@type' => 'NewsArticle',
          'headline' => {
            '@value' => 'Title in English',
            '@language' => 'en',
          },
          'description' => {
            '@value' => 'Text in English.',
            '@language' => 'en',
          },
        }.to_json
      end

      it 'gives correct title' do
        expect(subject.title).to eq 'Title in English'
      end

      it 'gives correct description' do
        expect(subject.description).to eq 'Text in English.'
      end
    end
  end

  context 'when Open Graph protocol data is present' do
    # Page whose metadata is expressed only through <meta property="..."> tags.
    # Note the upper-case "OG:" prefix on the description property; the example
    # below still expects that description to be extracted.
    let(:html) do
      <<~HTML
        <!doctype html>
        <html>
        <head>
          <meta property="og:url" content="https://example.com/dog.html">
          <meta property="og:title" content="Man bites dog">
          <meta property="OG:description" content="A dog's tale">
          <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
          <meta property="og:author" content="Charlie Brown">
          <meta property="og:locale" content="en">
          <meta property="og:image" content="https://example.com/snoopy.jpg">
          <meta property="og:image:alt" content="A good boy">
          <meta property="og:site_name" content="Pet News">
        </head>
        </html>
      HTML
    end

    it 'extracts the expected values from open graph data' do
      expect(subject).to have_attributes(
        canonical_url: eq('https://example.com/dog.html'), # from og:url
        title: eq('Man bites dog'),
        description: eq("A dog's tale"),
        published_at: eq('2022-01-31T19:53:00+00:00'), # from article:published_time
        author_name: eq('Charlie Brown'),
        language: eq('en'), # from og:locale
        image: eq('https://example.com/snoopy.jpg'),
        image_alt: eq('A good boy'),
        provider_name: eq('Pet News') # from og:site_name
      )
    end
  end
end