|
|
|
@ -1,27 +1,24 @@ |
|
|
|
import puppeteer from 'puppeteer'; |
|
|
|
import puppeteer from 'puppeteer'; |
|
|
|
|
|
|
|
|
|
|
|
/**
 * Optimized Threads Search Scraper
 * Navigates to the search page, enters a keyword, and extracts post URLs.
 * Uses a MutationObserver to catch posts in a virtualized list and improved scrolling.
 *
 * @param {string} keyword - The search term.
 * @param {number} [limit=20] - Maximum number of URLs to return.
 * @param {Array} [cookies=[]] - Optional: Array of Puppeteer-formatted cookies to handle login.
 * @param {string} [searchMode='KEYWORD'] - Search mode forwarded to the Threads search URL.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>} The unique post URLs found.
 */
|
|
|
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode='KEYWORD') { |
|
|
|
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') { |
|
|
|
if (!keyword) { |
|
|
|
if (!keyword) { |
|
|
|
throw new Error("Please provide a keyword for the search."); |
|
|
|
throw new Error("Please provide a keyword for the search."); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
const searchUrl = `https://www.threads.net/search?` |
|
|
|
const searchUrl = `https://www.threads.net/search?` |
|
|
|
+`q=${encodeURIComponent(keyword)}` |
|
|
|
+ `q=${encodeURIComponent(keyword)}` |
|
|
|
+`&search_type=TOP` |
|
|
|
+ `&search_type=TOP` |
|
|
|
+`&limit=${limit}` |
|
|
|
+ `&search_mode=${searchMode}`; |
|
|
|
+`&search_mode=${searchMode}` |
|
|
|
|
|
|
|
+`&media_type=TEXT`; |
|
|
|
console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`); |
|
|
|
console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const browser = await puppeteer.launch({ |
|
|
|
const browser = await puppeteer.launch({ |
|
|
|
headless: "new", |
|
|
|
headless: true,
|
|
|
|
|
|
|
|
// slowMo: 50,
|
|
|
|
args: [ |
|
|
|
args: [ |
|
|
|
'--disable-blink-features=AutomationControlled', |
|
|
|
'--disable-blink-features=AutomationControlled', |
|
|
|
'--no-sandbox', |
|
|
|
'--no-sandbox', |
|
|
|
@ -32,89 +29,104 @@ export async function searchThreads(keyword, limit = 20, cookies = [], searchMod |
|
|
|
try { |
|
|
|
try { |
|
|
|
const page = await browser.newPage(); |
|
|
|
const page = await browser.newPage(); |
|
|
|
|
|
|
|
|
|
|
|
// Anti-detection headers
|
|
|
|
|
|
|
|
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'); |
|
|
|
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'); |
|
|
|
await page.setExtraHTTPHeaders({ |
|
|
|
|
|
|
|
'Accept-Language': 'en-US,en;q=0.9', |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (cookies && cookies.length > 0) { |
|
|
|
if (cookies && cookies.length > 0) { |
|
|
|
console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`); |
|
|
|
console.log(`[Auth] Injecting ${cookies.length} cookies...`); |
|
|
|
try { |
|
|
|
await page.setCookie(...cookies); |
|
|
|
// 確保每個 cookie 對象至少包含 name, value 以及 domain 或 url
|
|
|
|
|
|
|
|
await browser.setCookie(...cookies); |
|
|
|
|
|
|
|
console.log("[Auth] Cookies 注入成功"); |
|
|
|
|
|
|
|
// console.log(await browser.cookies());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} catch (cookieError) { |
|
|
|
|
|
|
|
console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
await page.setViewport({ width: 1280, height: 900 }); |
|
|
|
await page.setViewport({ width: 1280, height: 1000 }); |
|
|
|
|
|
|
|
|
|
|
|
// Navigate to search results
|
|
|
|
await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 }); |
|
|
|
const response = await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 }); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Check if we were redirected to the login page
|
|
|
|
if (page.url().includes('/login')) { |
|
|
|
const currentUrl = page.url(); |
|
|
|
throw new Error("Authentication required: Redirected to login."); |
|
|
|
if (currentUrl.includes('/login')) { |
|
|
|
|
|
|
|
console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies."); |
|
|
|
|
|
|
|
throw new Error("Authentication required: Threads search is restricted to logged-in users."); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Wait for the results to start appearing
|
|
|
|
// Wait for initial load
|
|
|
|
try { |
|
|
|
try { |
|
|
|
// Threads search result container or post links
|
|
|
|
await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 }); |
|
|
|
await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 }); |
|
|
|
|
|
|
|
} catch (e) { |
|
|
|
} catch (e) { |
|
|
|
console.warn("[Warning] Results did not load. This might be a login wall or no results found."); |
|
|
|
console.warn("[Warning] Initial posts not found."); |
|
|
|
// Take a screenshot for debugging if needed (local environment only)
|
|
|
|
|
|
|
|
// await page.screenshot({ path: 'search_debug.png' });
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Scroll logic to gather more results
|
|
|
|
/** |
|
|
|
await page.evaluate(async (maxItems) => { |
|
|
|
* The Scrape Logic:
|
|
|
|
await new Promise((resolve) => { |
|
|
|
* 1. Uses MutationObserver to watch for NEW links added to the DOM. |
|
|
|
let totalHeight = 0; |
|
|
|
* 2. Periodically scrolls and waits for the network to fetch more. |
|
|
|
let distance = 500; |
|
|
|
* 3. Collects unique URLs into an Internal Set. |
|
|
|
let timer = setInterval(() => { |
|
|
|
*/ |
|
|
|
let scrollHeight = document.body.scrollHeight; |
|
|
|
const uniqueUrls = await page.evaluate(async (maxItems) => { |
|
|
|
window.scrollBy(0, distance); |
|
|
|
const discovered = new Set(); |
|
|
|
totalHeight += distance; |
|
|
|
|
|
|
|
|
|
|
|
return new Promise((resolve) => { |
|
|
|
const currentLinks = document.querySelectorAll('a[href*="/post/"]').length; |
|
|
|
// Function to extract valid post links from current DOM
|
|
|
|
// Stop if we hit the limit, bottom of page, or a safety cap
|
|
|
|
const grabLinks = () => { |
|
|
|
if (totalHeight >= scrollHeight || currentLinks >= maxItems || totalHeight > 15000) { |
|
|
|
const links = Array.from(document.querySelectorAll('a[href*="/post/"]')); |
|
|
|
clearInterval(timer); |
|
|
|
links.forEach(a => { |
|
|
|
resolve(); |
|
|
|
const href = a.href; |
|
|
|
|
|
|
|
// Filter out noise like /reposts or /replies sub-pages
|
|
|
|
|
|
|
|
if (!href.includes('/reposts') && !href.includes('/replies')) { |
|
|
|
|
|
|
|
// Threads links often have queries, clean them for de-duplication
|
|
|
|
|
|
|
|
const cleanUrl = href.split('?')[0]; |
|
|
|
|
|
|
|
discovered.add(cleanUrl); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Setup observer to catch posts as they are rendered during scroll
|
|
|
|
|
|
|
|
const observer = new MutationObserver(() => { |
|
|
|
|
|
|
|
grabLinks(); |
|
|
|
|
|
|
|
if (discovered.size >= maxItems) { |
|
|
|
|
|
|
|
cleanup(); |
|
|
|
} |
|
|
|
} |
|
|
|
}, 300); |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
}, limit); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Extracting the URLs
|
|
|
|
|
|
|
|
const postUrls = await page.evaluate(() => { |
|
|
|
|
|
|
|
const links = Array.from(document.querySelectorAll('a[href*="/post/"]')); |
|
|
|
|
|
|
|
return links |
|
|
|
|
|
|
|
.map(a => a.href) |
|
|
|
|
|
|
|
// Filter for actual post links and ignore UI elements like repost/share buttons
|
|
|
|
|
|
|
|
.filter(href => { |
|
|
|
|
|
|
|
const isPost = href.includes('/post/'); |
|
|
|
|
|
|
|
const isNotAction = !href.includes('/reposts') && !href.includes('/replies'); |
|
|
|
|
|
|
|
return isPost && isNotAction; |
|
|
|
|
|
|
|
}); |
|
|
|
}); |
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Deduplicate using a Set
|
|
|
|
observer.observe(document.body, { childList: true, subtree: true }); |
|
|
|
const uniqueUrls = [...new Set(postUrls)].slice(0, limit); |
|
|
|
|
|
|
|
|
|
|
|
const cleanup = () => { |
|
|
|
|
|
|
|
observer.disconnect(); |
|
|
|
|
|
|
|
clearInterval(scrollInterval); |
|
|
|
|
|
|
|
resolve(Array.from(discovered)); |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Perform incremental scrolling
|
|
|
|
|
|
|
|
let lastHeight = document.body.scrollHeight; |
|
|
|
|
|
|
|
let scrollAttempts = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const scrollInterval = setInterval(() => { |
|
|
|
|
|
|
|
window.scrollBy(0, 800); |
|
|
|
|
|
|
|
grabLinks(); // Manual grab just in case
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (discovered.size >= maxItems) { |
|
|
|
|
|
|
|
cleanup(); |
|
|
|
|
|
|
|
return; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Check if we've hit the bottom and no new content is loading
|
|
|
|
|
|
|
|
let newHeight = document.body.scrollHeight; |
|
|
|
|
|
|
|
if (newHeight === lastHeight) { |
|
|
|
|
|
|
|
scrollAttempts++; |
|
|
|
|
|
|
|
if (scrollAttempts > 10) cleanup(); // Stop if stuck for ~5 seconds
|
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
lastHeight = newHeight; |
|
|
|
|
|
|
|
scrollAttempts = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}, 500); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Safety timeout: 45 seconds total for scraping
|
|
|
|
|
|
|
|
setTimeout(cleanup, 45000); |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
}, limit); |
|
|
|
|
|
|
|
|
|
|
|
console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`); |
|
|
|
const resultUrls = uniqueUrls.slice(0, limit); |
|
|
|
|
|
|
|
console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`); |
|
|
|
|
|
|
|
|
|
|
|
return { |
|
|
|
return { |
|
|
|
keyword, |
|
|
|
keyword, |
|
|
|
count: uniqueUrls.length, |
|
|
|
count: resultUrls.length, |
|
|
|
urls: uniqueUrls |
|
|
|
urls: resultUrls |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
} catch (error) { |
|
|
|
} catch (error) { |
|
|
|
|