Introduction
Extracting all links from a page is a common task — for link auditing, SEO analysis, building sitemaps, or scraping. JavaScript gives you several ways to do this, from simple DOM queries to more advanced filtering and deduplication.
Basic: Get All Anchor Tags
The simplest approach uses querySelectorAll('a') to grab every <a> element:
// Log the destination URL and visible text of every anchor on the page.
for (const link of document.querySelectorAll('a')) {
  console.log(link.href, link.textContent.trim());
}
Collect Links into an Array
/**
 * Collect every anchor on the page as a plain object.
 * @returns {{text: string, href: string, title: string|null, target: string|null}[]}
 */
function getAllLinks() {
  const anchors = [...document.querySelectorAll('a')];
  return anchors.map((a) => ({
    text: a.textContent.trim(),
    href: a.href,
    title: a.title || null,   // empty string -> null
    target: a.target || null, // empty string -> null
  }));
}

const links = getAllLinks();
console.log(JSON.stringify(links, null, 2));
Wait for DOM Ready
If your script runs before the page finishes loading, wrap it in a DOMContentLoaded listener:
// Accumulates { name, url } records once the DOM has been parsed.
const sites = [];

document.addEventListener('DOMContentLoaded', () => {
  const links = document.querySelectorAll('a');
  links.forEach((link) => {
    sites.push({
      // textContent, not the non-standard outerText: outerText forces a
      // synchronous layout pass and is inconsistent with the other snippets.
      name: link.textContent.trim(),
      url: link.href,
    });
  });
  console.log(JSON.stringify(sites, null, 2));
});
Filter by Link Type
External Links Only
/**
 * Collect links whose hostname differs from the current page's.
 * Parses each href exactly once (the original parsed every URL twice:
 * once in the filter and again in the map).
 * @returns {{text: string, href: string, host: string}[]}
 */
function getExternalLinks() {
  const currentHost = window.location.hostname;
  const results = [];
  for (const link of document.querySelectorAll('a[href]')) {
    let url;
    try {
      url = new URL(link.href);
    } catch {
      continue; // invalid URL — skip it
    }
    if (url.hostname !== currentHost) {
      results.push({
        text: link.textContent.trim(),
        href: link.href,
        host: url.hostname,
      });
    }
  }
  return results;
}

console.log(getExternalLinks());
Internal Links Only
/**
 * Return the href of every link that points at the current host.
 * @returns {string[]}
 */
function getInternalLinks() {
  const currentHost = window.location.hostname;
  const pointsAtCurrentHost = (link) => {
    try {
      return new URL(link.href).hostname === currentHost;
    } catch {
      return false; // malformed URL — treat as not internal
    }
  };
  return [...document.querySelectorAll('a[href]')]
    .filter(pointsAtCurrentHost)
    .map((link) => link.href);
}
Links with Specific Attributes
// Links that open in a new tab
const newTabLinks = document.querySelectorAll('a[target="_blank"]');

// Links inside the site navigation
const navLinks = document.querySelectorAll('nav a');

// Links pointing to PDFs — test the URL *path* so query strings,
// fragments, and uppercase extensions (".PDF") don't cause misses,
// unlike a plain href.endsWith('.pdf') check.
const pdfLinks = Array.from(document.querySelectorAll('a[href]'))
  .filter((link) => {
    try {
      return new URL(link.href).pathname.toLowerCase().endsWith('.pdf');
    } catch {
      return false; // invalid URL
    }
  });

// Anchors with no href attribute (e.g. named anchors or placeholder links)
const anchorLinks = document.querySelectorAll('a:not([href])');
Deduplicate Links
/**
 * Collect links, keeping only the first occurrence of each URL.
 * @returns {{text: string, href: string}[]}
 */
function getUniqueLinks() {
  const byHref = new Map();
  for (const link of document.querySelectorAll('a[href]')) {
    // First occurrence wins; Map preserves insertion order.
    if (!byHref.has(link.href)) {
      byHref.set(link.href, {
        text: link.textContent.trim(),
        href: link.href,
      });
    }
  }
  return [...byHref.values()];
}
Run in Browser Console (Bookmarklet)
You can paste this directly into the browser console to extract links from any page:
// Paste in browser console
// Paste in browser console
(() => {
  // Map keyed by href: position is set by the first occurrence,
  // the stored value is the last occurrence (same as building a Map
  // from [href, link] pairs).
  const byHref = new Map();
  for (const a of document.querySelectorAll('a[href]')) {
    const entry = { text: a.textContent.trim(), href: a.href };
    if (entry.href.startsWith('http')) {
      byHref.set(entry.href, entry);
    }
  }
  const unique = [...byHref.values()];
  console.table(unique);
  copy(JSON.stringify(unique, null, 2)); // copies to clipboard
  console.log(`${unique.length} unique links copied to clipboard`);
})();
Node.js: Extract Links from HTML String
When working server-side (e.g., scraping), use a parser like cheerio:
npm install cheerio
import * as cheerio from 'cheerio';
import fetch from 'node-fetch';
/**
 * Fetch a page and return all of its links with hrefs resolved to
 * absolute URLs.
 * @param {string} url - Page to fetch; also the base for relative hrefs.
 * @returns {Promise<{text: string, href: string}[]>}
 */
async function extractLinks(url) {
  const response = await fetch(url);
  const $ = cheerio.load(await response.text());

  const results = [];
  $('a[href]').each((_, el) => {
    const $el = $(el);
    const text = $el.text().trim();
    try {
      // new URL(href, base) resolves relative hrefs against the page URL.
      results.push({ text, href: new URL($el.attr('href'), url).href });
    } catch {
      // skip invalid URLs
    }
  });
  return results;
}

const links = await extractLinks('https://example.com');
console.log(links);
Check for Broken Links
/**
 * Probe each link with a HEAD request and report its HTTP status.
 *
 * Failures are caught per link, so a failed request is reported with the
 * link's own href — the original allSettled version lost it and reported
 * 'unknown', which defeats the purpose of a broken-link checker.
 *
 * @param {{href: string}[]} links - Links to probe.
 * @returns {Promise<{href: string, status: number, ok: boolean, error?: string}[]>}
 */
async function checkLinks(links) {
  return Promise.all(
    links.map(async (link) => {
      try {
        const response = await fetch(link.href, { method: 'HEAD' });
        return {
          href: link.href,
          status: response.status,
          ok: response.ok,
        };
      } catch (err) {
        // Network/DNS failures land here; keep the href so the caller
        // knows exactly which link failed.
        return { href: link.href, status: 0, ok: false, error: err.message };
      }
    })
  );
}
// Probe every unique link on the page and report the ones that fail.
const results = await checkLinks(getUniqueLinks());
const broken = results.filter((r) => !r.ok);
console.log('Broken links:', broken);
Practical: Export Links as CSV
/**
 * Build a CSV of every link on the page and trigger a download.
 *
 * Fields are quoted per RFC 4180 (wrap in double quotes, double any
 * embedded double quotes), so commas and quotes in link text no longer
 * corrupt rows — the original stripped commas (lossy) and left embedded
 * quotes unescaped (malformed CSV).
 */
function exportLinksAsCSV() {
  // RFC 4180 field quoting: "…" with internal " doubled to "".
  const quote = (value) => `"${String(value).replace(/"/g, '""')}"`;

  const links = Array.from(document.querySelectorAll('a[href]'))
    .map((a) => ({
      text: a.textContent.trim(),
      href: a.href,
    }));

  const csv = [
    'Text,URL',
    ...links.map((l) => `${quote(l.text)},${quote(l.href)}`),
  ].join('\n');

  // Temporary object URL so the browser downloads the blob, then revoke it.
  const blob = new Blob([csv], { type: 'text/csv' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = 'links.csv';
  a.click();
  URL.revokeObjectURL(url);
}

exportLinksAsCSV();
Summary
| Method | Use Case |
|---|---|
| `querySelectorAll('a')` | Get all anchor elements |
| `querySelectorAll('a[href]')` | Only links with an href |
| Filter by hostname | Separate internal vs external |
| `Set`/`Map` deduplication | Remove duplicate URLs |
| `cheerio` (Node.js) | Parse HTML server-side |
| `fetch` with HEAD | Check if links are alive |
Comments