Skip to main content

How to Get All Links on an HTML Page with JavaScript

Created: February 23, 2022 4 min read

Introduction

Extracting all links from a page is a common task — for link auditing, SEO analysis, building sitemaps, or scraping. JavaScript gives you several ways to do this, from simple DOM queries to more advanced filtering and deduplication. See the JavaScript Guide for more context.

Basic: Get All Anchor Tags

The simplest approach uses querySelectorAll('a') to grab every <a> element:

// Log the URL and trimmed visible text of every anchor on the page.
for (const anchor of document.querySelectorAll('a')) {
  console.log(anchor.href, anchor.textContent.trim());
}
/**
 * Collect every anchor on the page as a plain object.
 * Empty `title`/`target` attributes are normalized to null.
 * @returns {Array<{text: string, href: string, title: string|null, target: string|null}>}
 */
function getAllLinks() {
  const anchors = Array.from(document.querySelectorAll('a'));

  return anchors.map((anchor) => {
    const { href } = anchor;
    return {
      text: anchor.textContent.trim(),
      href,
      title: anchor.title || null,
      target: anchor.target || null,
    };
  });
}

// Dump the collected links as pretty-printed JSON.
const allLinks = getAllLinks();
console.log(JSON.stringify(allLinks, null, 2));

Wait for DOM Ready

If your script runs before the page finishes loading, wrap it in a DOMContentLoaded listener:

const sites = [];

// Collect link names and URLs once the DOM is fully parsed.
document.addEventListener('DOMContentLoaded', () => {
  const links = document.querySelectorAll('a');

  links.forEach(link => {
    sites.push({
      // textContent instead of outerText: outerText forces a layout reflow,
      // was historically unsupported in Firefox, and reflects CSS
      // text-transform — textContent is consistent with the other snippets.
      name: link.textContent.trim(),
      url: link.href
    });
  });

  console.log(JSON.stringify(sites, null, 2));
});
/**
 * Collect all links whose hostname differs from the current page's.
 * Anchors whose href cannot be parsed as a URL are skipped.
 * @returns {Array<{text: string, href: string, host: string}>}
 */
function getExternalLinks() {
  const currentHost = window.location.hostname;

  return Array.from(document.querySelectorAll('a[href]'))
    .map(link => {
      // Parse each href exactly once (the original parsed twice:
      // once in the filter and again in the map).
      try {
        return { link, url: new URL(link.href) };
      } catch {
        return null; // invalid URL
      }
    })
    .filter(entry => entry !== null && entry.url.hostname !== currentHost)
    .map(({ link, url }) => ({
      text: link.textContent.trim(),
      href: link.href,
      host: url.hostname
    }));
}

// Print every off-site link found on the page.
const externalLinks = getExternalLinks();
console.log(externalLinks);
/**
 * Return the hrefs of all links pointing at the current host.
 * Anchors whose href is not a valid URL are ignored.
 * @returns {string[]}
 */
function getInternalLinks() {
  const { hostname } = window.location;
  const internal = [];

  for (const link of document.querySelectorAll('a[href]')) {
    try {
      if (new URL(link.href).hostname === hostname) {
        internal.push(link.href);
      }
    } catch {
      // unparseable href — skip it
    }
  }

  return internal;
}
// Links that open in a new tab
const newTabLinks = document.querySelectorAll('a[target="_blank"]');

// Links inside the site navigation (descendant selector, not a class)
const navLinks = document.querySelectorAll('nav a');

// Links pointing to PDFs
const pdfLinks = Array.from(document.querySelectorAll('a[href]'))
  .filter(link => link.href.endsWith('.pdf'));

// Anchors without an href attribute (e.g. placeholders or named anchors)
const anchorLinks = document.querySelectorAll('a:not([href])');
/**
 * Return every link on the page, deduplicated by absolute URL.
 * The first occurrence of each URL wins; document order is preserved.
 * @returns {Array<{text: string, href: string}>}
 */
function getUniqueLinks() {
  const byHref = new Map();

  for (const link of document.querySelectorAll('a[href]')) {
    if (!byHref.has(link.href)) {
      byHref.set(link.href, {
        text: link.textContent.trim(),
        href: link.href
      });
    }
  }

  return [...byHref.values()];
}

Run in Browser Console (Bookmarklet)

You can paste this directly into the browser console to extract links from any page:

// Paste in browser console
// Paste in browser console
(() => {
  // Keyed by href so duplicates collapse; the first insertion fixes the
  // position, later duplicates overwrite the value (same as the Map
  // constructor over [href, link] pairs).
  const byHref = new Map();

  for (const a of document.querySelectorAll('a[href]')) {
    const entry = { text: a.textContent.trim(), href: a.href };
    if (entry.href.startsWith('http')) {
      byHref.set(entry.href, entry);
    }
  }

  const unique = [...byHref.values()];

  console.table(unique);
  copy(JSON.stringify(unique, null, 2)); // copies to clipboard
  console.log(`${unique.length} unique links copied to clipboard`);
})();

When working server-side (e.g., scraping), use a parser like cheerio:

npm install cheerio
import * as cheerio from 'cheerio';
import fetch from 'node-fetch';

/**
 * Fetch a page and return its links, with relative hrefs resolved
 * against the page URL.
 * @param {string} url - Page to fetch; also the base for relative hrefs.
 * @returns {Promise<Array<{text: string, href: string}>>}
 */
async function extractLinks(url) {
  const response = await fetch(url);
  const $ = cheerio.load(await response.text());

  const links = [];

  $('a[href]').each((_, el) => {
    const anchor = $(el);
    const text = anchor.text().trim();

    // Resolve relative URLs; anchors with invalid hrefs are skipped.
    try {
      const absolute = new URL(anchor.attr('href'), url).href;
      links.push({ text, href: absolute });
    } catch {
      // skip invalid URLs
    }
  });

  return links;
}

const links = await extractLinks('https://example.com');
console.log(links);
/**
 * Probe each link with a HEAD request and report its HTTP status.
 * All requests run in parallel; a failed request never rejects the
 * whole batch thanks to Promise.allSettled.
 * @param {Array<{href: string}>} links - Links to check.
 * @returns {Promise<Array<{href: string, status: number, ok: boolean, error?: string}>>}
 */
async function checkLinks(links) {
  const results = await Promise.allSettled(
    links.map(async link => {
      const response = await fetch(link.href, { method: 'HEAD' });
      return {
        href: link.href,
        status: response.status,
        ok: response.ok
      };
    })
  );

  // allSettled preserves input order, so results[i] corresponds to
  // links[i] — a rejected probe keeps its real href instead of the
  // useless 'unknown' placeholder.
  return results.map((r, i) => r.status === 'fulfilled' ? r.value : {
    href: links[i].href,
    status: 0,
    ok: false,
    error: r.reason?.message
  });
}

// Check every unique link on the page and report the broken ones.
const uniqueLinks = getUniqueLinks();
const checkResults = await checkLinks(uniqueLinks);
const brokenLinks = checkResults.filter(r => !r.ok);
console.log('Broken links:', brokenLinks);
/**
 * Download every link on the page as a two-column CSV (Text, URL).
 * Fields are quoted and embedded quote characters are doubled per
 * RFC 4180, so commas AND quotes in link text survive intact — the
 * previous version stripped commas (lossy) and let a quote in the
 * text corrupt the row.
 */
function exportLinksAsCSV() {
  // Quote a field, doubling any embedded double quotes (RFC 4180 §2.7).
  const quote = value => `"${String(value).replace(/"/g, '""')}"`;

  const rows = Array.from(document.querySelectorAll('a[href]'))
    .map(a => [a.textContent.trim(), a.href]);

  const csv = [
    'Text,URL',
    ...rows.map(row => row.map(quote).join(','))
  ].join('\n');

  const blob = new Blob([csv], { type: 'text/csv' });
  const url = URL.createObjectURL(blob);

  // Trigger a download via a temporary anchor element.
  const a = document.createElement('a');
  a.href = url;
  a.download = 'links.csv';
  a.click();

  // Release the object URL once the download has been triggered.
  URL.revokeObjectURL(url);
}

exportLinksAsCSV();

Summary

Method Use Case
querySelectorAll('a') Get all anchor elements
querySelectorAll('a[href]') Only links with an href
Filter by hostname Separate internal vs external
Set deduplication Remove duplicate URLs
cheerio (Node.js) Parse HTML server-side
fetch HEAD Check if links are alive

Resources

Comments

Share this article

Scan to read on mobile