Introduction
Extracting all links from a page is a common task — for link auditing, SEO analysis, building sitemaps, or scraping. JavaScript gives you several ways to do this, from simple DOM queries to more advanced filtering and deduplication. See the JavaScript Guide for more context.
Basic: Get All Anchor Tags
The simplest approach uses querySelectorAll('a') to grab every <a> element:
// Log every anchor's resolved URL and visible text.
const links = document.querySelectorAll('a');
for (const link of links) {
  console.log(link.href, link.textContent.trim());
}
Collect Links into an Array
// Return every anchor on the page as a plain object.
// Empty title/target strings are normalized to null.
function getAllLinks() {
  const anchors = [...document.querySelectorAll('a')];
  return anchors.map((anchor) => {
    const { href, title, target } = anchor;
    return {
      text: anchor.textContent.trim(),
      href,
      title: title || null,
      target: target || null,
    };
  });
}
// Pretty-print the collected links as JSON.
const allLinks = getAllLinks();
console.log(JSON.stringify(allLinks, null, 2));
Wait for DOM Ready
If your script runs before the page finishes loading, wrap it in a DOMContentLoaded listener:
// Collect { name, url } for every anchor once the DOM is parsed.
// Note: the listener fires only if this script runs before
// DOMContentLoaded; on an already-loaded page it never triggers.
const sites = [];
document.addEventListener('DOMContentLoaded', () => {
  const links = document.querySelectorAll('a');
  links.forEach(link => {
    sites.push({
      // textContent is standard and layout-independent; the original
      // outerText forces a layout pass and is undefined in
      // non-rendered DOMs (jsdom, detached nodes).
      name: link.textContent.trim(),
      url: link.href
    });
  });
  console.log(JSON.stringify(sites, null, 2));
});
Filter by Link Type
External Links Only
/**
 * Collect links whose hostname differs from the current page's.
 * Anchors whose href cannot be parsed as a URL are skipped.
 * Each href is parsed exactly once (the original parsed it twice:
 * once in the filter and again in the map).
 * @returns {{ text: string, href: string, host: string }[]}
 */
function getExternalLinks() {
  const currentHost = window.location.hostname;
  const results = [];
  for (const link of document.querySelectorAll('a[href]')) {
    let url;
    try {
      url = new URL(link.href);
    } catch {
      continue; // invalid URL — skip
    }
    if (url.hostname !== currentHost) {
      results.push({
        text: link.textContent.trim(),
        href: link.href,
        host: url.hostname
      });
    }
  }
  return results;
}
// Inspect the external links in the console.
const externalLinks = getExternalLinks();
console.log(externalLinks);
Internal Links Only
// Return the hrefs of all links pointing at the current host.
function getInternalLinks() {
  const host = window.location.hostname;
  const isSameHost = (link) => {
    try {
      return new URL(link.href).hostname === host;
    } catch {
      return false; // unparsable href
    }
  };
  return [...document.querySelectorAll('a[href]')]
    .filter(isSameHost)
    .map((link) => link.href);
}
Links with Specific Attributes
// Links that open in a new tab
const newTabLinks = document.querySelectorAll('a[target="_blank"]');
// Links inside a <nav> element
const navLinks = document.querySelectorAll('nav a');
// Links pointing to PDFs — check the URL *path* so query strings,
// fragments, and uppercase extensions don't defeat the match
// (naive href.endsWith('.pdf') misses "report.pdf?v=2" and "doc.PDF").
const pdfLinks = Array.from(document.querySelectorAll('a[href]'))
  .filter(link => {
    try {
      return new URL(link.href).pathname.toLowerCase().endsWith('.pdf');
    } catch {
      return false; // unparsable href
    }
  });
// Anchors with no href attribute at all (placeholders / named anchors)
const anchorLinks = document.querySelectorAll('a:not([href])');
Deduplicate Links
// Return { text, href } per distinct URL; the first occurrence of
// each href wins and later duplicates are dropped.
function getUniqueLinks() {
  const byHref = new Map();
  for (const link of document.querySelectorAll('a[href]')) {
    if (!byHref.has(link.href)) {
      byHref.set(link.href, {
        text: link.textContent.trim(),
        href: link.href,
      });
    }
  }
  return [...byHref.values()];
}
Run in Browser Console (Bookmarklet)
You can paste this directly into the browser console to extract links from any page:
// Paste in browser console
// Paste in browser console
(() => {
  // Keyed by href: a later duplicate overwrites an earlier one,
  // but keeps the first occurrence's position (Map semantics).
  const byHref = new Map();
  for (const a of document.querySelectorAll('a[href]')) {
    const entry = { text: a.textContent.trim(), href: a.href };
    if (entry.href.startsWith('http')) {
      byHref.set(entry.href, entry);
    }
  }
  const unique = [...byHref.values()];
  console.table(unique);
  copy(JSON.stringify(unique, null, 2)); // copies to clipboard
  console.log(`${unique.length} unique links copied to clipboard`);
})();
Node.js: Extract Links from HTML String
When working server-side (e.g., scraping), use a parser like cheerio. On Node 18+ the built-in global fetch makes the node-fetch package optional:
npm install cheerio
import * as cheerio from 'cheerio';
import fetch from 'node-fetch';
/**
 * Fetch a page and return its anchors as { text, href },
 * with relative hrefs resolved against the page URL.
 * Anchors whose href cannot be resolved are skipped.
 */
async function extractLinks(url) {
  const response = await fetch(url);
  const $ = cheerio.load(await response.text());
  const links = [];
  for (const el of $('a[href]').toArray()) {
    const node = $(el);
    const href = node.attr('href');
    const text = node.text().trim();
    try {
      // Resolve relative URLs against the page URL
      links.push({ text, href: new URL(href, url).href });
    } catch {
      // skip invalid URLs
    }
  }
  return links;
}
// Top-level await — requires an ES module context.
const pageLinks = await extractLinks('https://example.com');
console.log(pageLinks);
Check for Broken Links
/**
 * HEAD-request every link and report its HTTP status.
 * Uses Promise.allSettled so one network failure doesn't abort the batch.
 * @param {{ href: string }[]} links
 * @returns {Promise<{ href: string, status: number, ok: boolean, error?: string }[]>}
 */
async function checkLinks(links) {
  const results = await Promise.allSettled(
    links.map(async link => {
      const response = await fetch(link.href, { method: 'HEAD' });
      return {
        href: link.href,
        status: response.status,
        ok: response.ok
      };
    })
  );
  // allSettled preserves input order, so results[i] corresponds to
  // links[i]; use that to report the failing href instead of the
  // original's unhelpful 'unknown'.
  return results.map((r, i) => r.status === 'fulfilled' ? r.value : {
    href: links[i].href,
    status: 0,
    ok: false,
    error: r.reason?.message
  });
}
// Audit the page: dedupe, probe each URL, report failures.
const uniqueLinks = getUniqueLinks();
const checked = await checkLinks(uniqueLinks);
const broken = checked.filter((result) => !result.ok);
console.log('Broken links:', broken);
Practical: Export Links as CSV
/**
 * Download every link on the page as a two-column CSV (Text, URL).
 * Fields are RFC 4180-quoted: embedded double quotes are doubled,
 * so commas and quotes in link text no longer corrupt rows (the
 * original stripped commas destructively and left quotes unescaped,
 * which broke the CSV).
 */
function exportLinksAsCSV() {
  // Wrap a field in quotes, doubling any embedded quote characters.
  const quote = (value) => `"${value.replace(/"/g, '""')}"`;
  const links = Array.from(document.querySelectorAll('a[href]'))
    .map(a => ({
      text: a.textContent.trim(),
      href: a.href
    }));
  const csv = [
    'Text,URL',
    ...links.map(l => `${quote(l.text)},${quote(l.href)}`)
  ].join('\n');
  const blob = new Blob([csv], { type: 'text/csv' });
  const url = URL.createObjectURL(blob);
  // Synthesize a click on a temporary download link.
  const a = document.createElement('a');
  a.href = url;
  a.download = 'links.csv';
  a.click();
  URL.revokeObjectURL(url); // release the blob URL
}
// Trigger the CSV download immediately (requires a browser DOM).
exportLinksAsCSV();
Summary
| Method | Use Case |
|---|---|
| `querySelectorAll('a')` | Get all anchor elements |
| `querySelectorAll('a[href]')` | Only links with an href |
| Filter by hostname | Separate internal vs external |
| `Set` deduplication | Remove duplicate URLs |
| `cheerio` (Node.js) | Parse HTML server-side |
| `fetch` HEAD | Check if links are alive |
Comments