You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

119 lines
4.6 KiB

import puppeteer from 'puppeteer';
/**
 * Threads Search Scraper
 *
 * Launches a headless browser, opens the Threads search page for a keyword,
 * scrolls the page to load more results, and collects post URLs.
 *
 * @param {string} keyword - The search term. Must be non-empty.
 * @param {number} [limit=20] - Maximum number of URLs to return. Coerced to a
 *   number and truncated; negative or non-numeric values are treated as 0.
 * @param {Array} [cookies=[]] - Optional: Array of Puppeteer-formatted cookies to handle login.
 * @param {string} [searchMode='KEYWORD'] - Value sent as the `search_mode` query parameter.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>} The
 *   keyword, the number of unique URLs found, and the URLs themselves.
 * @throws {Error} If `keyword` is empty, or if the page URL contains `/login`
 *   after navigation. Navigation/browser errors are logged and rethrown.
 */
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') {
  if (!keyword) {
    throw new Error("Please provide a keyword for the search.");
  }

  const postLinkSelector = 'a[href*="/post/"]';
  const scrollStepPx = 500;
  const scrollIntervalMs = 300;
  // Safety cap so a never-ending feed cannot keep the scroll loop alive forever.
  const maxScrollPx = 15000;

  // A negative limit would make `slice(0, limit)` drop trailing items instead
  // of returning none, so clamp anything that is not a positive number to 0.
  const parsedLimit = Math.trunc(Number(limit));
  const maxResults = parsedLimit > 0 ? parsedLimit : 0;

  // Every caller-supplied value is percent-encoded so that characters such as
  // `&` or `=` cannot break out of their query parameter.
  const query = [
    `q=${encodeURIComponent(keyword)}`,
    'search_type=TOP',
    `limit=${maxResults}`,
    `search_mode=${encodeURIComponent(searchMode)}`,
    'media_type=TEXT',
  ].join('&');
  const searchUrl = `https://www.threads.net/search?${query}`;

  console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`);

  const browser = await puppeteer.launch({
    headless: "new",
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox',
    ],
  });

  try {
    const page = await browser.newPage();

    // Anti-detection headers
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
    });

    // Inject cookies if provided to bypass login wall
    if (cookies && cookies.length > 0) {
      console.log("[Auth] Injecting session cookies...");
      await page.setCookie(...cookies);
    }

    await page.setViewport({ width: 1280, height: 900 });

    // Navigate to search results
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Check if we were redirected to the login page
    if (page.url().includes('/login')) {
      console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies.");
      throw new Error("Authentication required: Threads search is restricted to logged-in users.");
    }

    // Wait for the first post link to appear. A timeout here is deliberately
    // non-fatal: the search may simply have no results, in which case an
    // empty list is returned below.
    try {
      await page.waitForSelector(postLinkSelector, { timeout: 20000 });
    } catch (e) {
      console.warn("[Warning] Results did not load. This might be a login wall or no results found.");
    }

    // Scroll and harvest in one pass. The stop condition counts *unique,
    // filtered* post URLs (not raw anchors), so duplicate or action links
    // cannot end the scroll before `maxResults` real posts were seen.
    // Harvesting on every tick also keeps URLs whose anchors are removed from
    // the DOM later (NOTE(review): whether Threads unmounts off-screen posts
    // is not confirmed; harvesting per tick is harmless either way).
    // `maxResults` is passed as a top-level argument rather than inside an
    // object so that non-JSON numbers such as Infinity survive serialization.
    const collectedUrls = await page.evaluate(
      async (maxItems, selector, stepPx, intervalMs, scrollCapPx) => {
        const found = new Set();
        const harvest = () => {
          for (const anchor of document.querySelectorAll(selector)) {
            const href = anchor.href;
            // Ignore UI elements like repost/reply links that share the post path.
            if (!href.includes('/reposts') && !href.includes('/replies')) {
              found.add(href);
            }
          }
        };

        harvest();
        await new Promise((resolve) => {
          let scrolledPx = 0;
          const timer = setInterval(() => {
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, stepPx);
            scrolledPx += stepPx;
            harvest();
            // Stop if we hit the limit, bottom of page, or the safety cap
            if (scrolledPx >= scrollHeight || found.size >= maxItems || scrolledPx > scrollCapPx) {
              clearInterval(timer);
              resolve();
            }
          }, intervalMs);
        });

        return [...found];
      },
      maxResults,
      postLinkSelector,
      scrollStepPx,
      scrollIntervalMs,
      maxScrollPx,
    );

    // Deduplicate again on the Node side (cheap) and cap to the requested size.
    const uniqueUrls = [...new Set(collectedUrls)].slice(0, maxResults);
    console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`);

    return {
      keyword,
      count: uniqueUrls.length,
      urls: uniqueUrls,
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    // Always release the browser process, on success and on failure.
    await browser.close();
  }
}