Skip to main content
⚡ Calmops

How to Get All Links on an HTML Page with JavaScript

Introduction

Extracting all links from a page is a common task — for link auditing, SEO analysis, building sitemaps, or scraping. JavaScript gives you several ways to do this, from simple DOM queries to more advanced filtering and deduplication.

Basic: Get All Anchor Tags

The simplest approach uses querySelectorAll('a') to grab every <a> element:

// Grab every <a> element and log its URL alongside its trimmed text.
const links = document.querySelectorAll('a');

for (const link of links) {
  console.log(link.href, link.textContent.trim());
}
/**
 * Collect metadata for every <a> element on the page.
 * Empty title/target attributes are normalized to null.
 * @returns {{text: string, href: string, title: string|null, target: string|null}[]}
 */
function getAllLinks() {
  const result = [];

  for (const anchor of document.querySelectorAll('a')) {
    result.push({
      text: anchor.textContent.trim(),
      href: anchor.href,
      title: anchor.title || null,
      target: anchor.target || null
    });
  }

  return result;
}

// Example: dump every link on the page as pretty-printed JSON.
const links = getAllLinks();
console.log(JSON.stringify(links, null, 2));

Wait for DOM Ready

If your script runs before the page finishes loading, wrap it in a DOMContentLoaded listener:

// Accumulates { name, url } records for every link once the DOM is ready.
const sites = [];

document.addEventListener('DOMContentLoaded', () => {
  const links = document.querySelectorAll('a');

  links.forEach(link => {
    sites.push({
      // textContent instead of outerText: outerText forces a layout pass,
      // was historically unsupported in some browsers, and every other
      // snippet in this article already uses textContent.
      name: link.textContent.trim(),
      url: link.href
    });
  });

  console.log(JSON.stringify(sites, null, 2));
});
/**
 * List links whose hostname differs from the current page's hostname.
 * Unparsable hrefs are skipped.
 * @returns {{text: string, href: string, host: string}[]}
 */
function getExternalLinks() {
  const currentHost = window.location.hostname;
  const external = [];

  for (const link of document.querySelectorAll('a[href]')) {
    let url;
    try {
      // Parse once and reuse — the original parsed each href twice
      // (once in filter, again in map).
      url = new URL(link.href);
    } catch {
      continue; // invalid URL
    }

    if (url.hostname !== currentHost) {
      external.push({
        text: link.textContent.trim(),
        href: link.href,
        host: url.hostname
      });
    }
  }

  return external;
}

// Example: print external links to the console.
console.log(getExternalLinks());
/**
 * Return the absolute URLs of all links pointing at the current host.
 * Unparsable hrefs are skipped.
 * @returns {string[]}
 */
function getInternalLinks() {
  const currentHost = window.location.hostname;
  const internal = [];

  for (const link of document.querySelectorAll('a[href]')) {
    try {
      if (new URL(link.href).hostname === currentHost) {
        internal.push(link.href);
      }
    } catch {
      // unparsable href — ignore
    }
  }

  return internal;
}
// Links that open in a new tab
const newTabLinks = document.querySelectorAll('a[target="_blank"]');

// Links inside the site navigation (descendant selector, not a class)
const navLinks = document.querySelectorAll('nav a');

// Links pointing to PDFs — check the URL *path* so query strings and
// fragments (e.g. report.pdf?v=2) don't hide a match.
const pdfLinks = Array.from(document.querySelectorAll('a[href]'))
  .filter(link => new URL(link.href).pathname.toLowerCase().endsWith('.pdf'));

// Anchor elements without an href attribute (placeholders / named anchors)
const anchorLinks = document.querySelectorAll('a:not([href])');
/**
 * Collect links on the page, keeping only the FIRST occurrence of each
 * distinct href.
 * @returns {{text: string, href: string}[]}
 */
function getUniqueLinks() {
  const seen = new Set();
  const unique = [];

  for (const link of document.querySelectorAll('a[href]')) {
    if (!seen.has(link.href)) {
      seen.add(link.href);
      unique.push({
        text: link.textContent.trim(),
        href: link.href
      });
    }
  }

  return unique;
}

Run in Browser Console (Bookmarklet)

You can paste this directly into the browser console to extract links from any page:

// Paste in browser console
(function() {
  const links = Array.from(document.querySelectorAll('a[href]'))
    .map(a => ({ text: a.textContent.trim(), href: a.href }))
    .filter(l => l.href.startsWith('http'));

  const unique = [...new Map(links.map(l => [l.href, l])).values()];

  console.table(unique);
  copy(JSON.stringify(unique, null, 2)); // copies to clipboard
  console.log(`${unique.length} unique links copied to clipboard`);
})();

When working server-side (e.g., scraping), use a parser like cheerio:

npm install cheerio
import * as cheerio from 'cheerio';
import fetch from 'node-fetch';

/**
 * Fetch a page server-side and extract every link, resolving relative
 * hrefs against the page URL.
 * @param {string} url - Absolute URL of the page to scrape.
 * @returns {Promise<{text: string, href: string}[]>}
 */
async function extractLinks(url) {
  const response = await fetch(url);
  const $ = cheerio.load(await response.text());

  const links = [];

  $('a[href]').each((_, el) => {
    const node = $(el);
    try {
      // new URL(href, base) turns relative hrefs into absolute URLs
      links.push({
        text: node.text().trim(),
        href: new URL(node.attr('href'), url).href
      });
    } catch {
      // href doesn't form a valid URL — skip it
    }
  });

  return links;
}

// Example usage (top-level await requires an ES module context).
const links = await extractLinks('https://example.com');
console.log(links);
/**
 * Probe each link with a HEAD request and report its HTTP status.
 * Network failures are reported as entries (ok: false), not thrown.
 * @param {{href: string}[]} links
 * @returns {Promise<{href: string, status: number, ok: boolean, error?: string}[]>}
 */
async function checkLinks(links) {
  const results = await Promise.allSettled(
    links.map(async link => {
      const response = await fetch(link.href, { method: 'HEAD' });
      return {
        href: link.href,
        status: response.status,
        ok: response.ok
      };
    })
  );

  // Promise.allSettled preserves input order, so results[i] corresponds
  // to links[i] — report WHICH link failed instead of 'unknown'.
  return results.map((r, i) =>
    r.status === 'fulfilled'
      ? r.value
      : {
          href: links[i].href,
          status: 0,
          ok: false,
          error: r.reason?.message
        }
  );
}

// Find broken links among the unique links on the current page.
const links = getUniqueLinks();
const results = await checkLinks(links);
const broken = results.filter(r => !r.ok);
console.log('Broken links:', broken);
/**
 * Collect every link on the page and trigger a download of a
 * two-column CSV (Text, URL).
 */
function exportLinksAsCSV() {
  // RFC 4180-style quoting: wrap each field in quotes and double any
  // embedded quote. This preserves commas in link text (the original
  // destructively replaced them) and keeps embedded quotes from
  // corrupting the row structure.
  const quote = value => `"${String(value).replace(/"/g, '""')}"`;

  const rows = Array.from(document.querySelectorAll('a[href]'))
    .map(a => `${quote(a.textContent.trim())},${quote(a.href)}`);

  const csv = ['Text,URL', ...rows].join('\n');

  const blob = new Blob([csv], { type: 'text/csv' });
  const url = URL.createObjectURL(blob);

  const a = document.createElement('a');
  a.href = url;
  a.download = 'links.csv';
  a.click();

  // Release the object URL once the download has been triggered.
  URL.revokeObjectURL(url);
}

// Trigger the CSV download.
exportLinksAsCSV();

Summary

Method Use Case
querySelectorAll('a') Get all anchor elements
querySelectorAll('a[href]') Only links with an href
Filter by hostname Separate internal vs external
Set deduplication Remove duplicate URLs
cheerio (Node.js) Parse HTML server-side
fetch HEAD Check if links are alive

Resources

Comments