You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
114 lines
4.4 KiB
114 lines
4.4 KiB
import puppeteer from 'puppeteer';
|
|
|
|
/**
 * Threads Search Scraper
 *
 * Launches a headless browser, opens the Threads search page for `keyword`,
 * scrolls the page while collecting links that contain "/post/", and returns
 * the de-duplicated post URLs.
 *
 * @param {string} keyword - The search term. Must not be empty or whitespace-only.
 * @param {number} [limit=20] - Maximum number of URLs to return. Fractional
 *   values are rounded down and negative values are treated as 0. A limit of 0
 *   returns an empty result without launching a browser.
 * @param {Array} [cookies=[]] - Optional: Array of Puppeteer-formatted cookies to handle login.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>} The
 *   keyword as given, the number of URLs returned, and the unique post URLs.
 * @throws {TypeError} If `limit` cannot be interpreted as a number.
 * @throws {Error} If the keyword is missing, if Threads redirects to its login
 *   page, or if any browser operation fails (the original error is rethrown).
 */
export async function searchThreads(keyword, limit = 20, cookies = []) {
  if (!keyword || String(keyword).trim() === '') {
    throw new Error("Please provide a keyword for the search.");
  }

  // Normalize the limit up front: a negative value passed to slice(0, limit)
  // would drop items from the end instead of returning nothing.
  const maxResults = Math.max(0, Math.floor(Number(limit)));
  if (Number.isNaN(maxResults)) {
    throw new TypeError(`limit must be a number, received: ${String(limit)}`);
  }
  if (maxResults === 0) {
    return { keyword, count: 0, urls: [] };
  }

  const searchUrl = `https://www.threads.net/search?q=${encodeURIComponent(keyword)}&serp_type=default`;
  console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`);

  const browser = await puppeteer.launch({
    headless: "new",
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox'
    ]
  });

  try {
    const page = await browser.newPage();

    // Anti-detection headers
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
    });

    // Inject cookies if provided to bypass login wall
    if (cookies && cookies.length > 0) {
      console.log("[Auth] Injecting session cookies...");
      await page.setCookie(...cookies);
    }

    await page.setViewport({ width: 1280, height: 900 });

    // Navigate to search results
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Check if we were redirected to the login page
    const currentUrl = page.url();
    if (currentUrl.includes('/login')) {
      console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies.");
      throw new Error("Authentication required: Threads search is restricted to logged-in users.");
    }

    // Wait for the first post link to appear. A failure here is deliberately
    // non-fatal: the collection step below then simply finds nothing.
    try {
      await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 });
    } catch (e) {
      console.warn("[Warning] Results did not load. This might be a login wall or no results found.", e.message);
    }

    // Scroll and collect in a single in-page pass. The stop condition uses the
    // number of unique, filtered post URLs rather than the raw anchor count,
    // which also counts duplicates and links that are filtered out later.
    const postUrls = await page.evaluate((maxItems) => {
      return new Promise((resolve) => {
        const SCROLL_STEP_PX = 500;
        const SCROLL_INTERVAL_MS = 300;
        const MAX_SCROLL_PX = 15000; // safety cap on total scrolling
        const seen = new Set(); // a Set keeps first-seen order and drops duplicates
        let scrolledPx = 0;

        // Record actual post links, ignoring UI elements like repost/reply links.
        const collect = () => {
          for (const anchor of document.querySelectorAll('a[href*="/post/"]')) {
            const href = anchor.href;
            const isPost = href.includes('/post/');
            const isAction = href.includes('/reposts') || href.includes('/replies');
            if (isPost && !isAction) {
              seen.add(href);
            }
          }
        };

        // The initial render may already satisfy the limit.
        collect();
        if (seen.size >= maxItems) {
          resolve([...seen]);
          return;
        }

        const timer = setInterval(() => {
          const pageHeight = document.body.scrollHeight;
          window.scrollBy(0, SCROLL_STEP_PX);
          scrolledPx += SCROLL_STEP_PX;
          collect();

          // Stop if we hit the limit, bottom of page, or the safety cap
          if (scrolledPx >= pageHeight || seen.size >= maxItems || scrolledPx > MAX_SCROLL_PX) {
            clearInterval(timer);
            resolve([...seen]);
          }
        }, SCROLL_INTERVAL_MS);
      });
    }, maxResults);

    // Already de-duplicated in the page; only the cap remains to be applied.
    const uniqueUrls = postUrls.slice(0, maxResults);

    console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`);

    return {
      keyword,
      count: uniqueUrls.length,
      urls: uniqueUrls
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    // Always release the browser process, on success or failure.
    await browser.close();
  }
}