main
reng 2 weeks ago
parent c3501c1028
commit 03a16df356
  1. 3
      v2/app/src/components/graph.jsx
  2. 6
      v2/scrapper/main.js
  3. 160
      v2/scrapper/search.js

@ -78,6 +78,9 @@ export default function Graph({results}){
if(!points || points.length===0) return;
sendOsc('/clear', JSON.stringify({}));
// return;
const keywords=results.filter((point)=>point?.type==='keyword');

@ -11,7 +11,7 @@ dotenv.config();
const cookies = [
{
name: 'sessionid',
value: '64605724719%3ALlZCmwghVyOAck%3A23%3AAYhptDoKttkGRWkpa5583neohBfLXlGfOlwPPmdP1w',
value: '64605724719%3AP70RqmwttERST3%3A1%3AAYhGxKgIRQ7XUEx9LXhEzvt1l21d4SzEAmGYeE-D0Q',
domain: '.threads.com',
path:'/',
httpOnly:true,
@ -29,7 +29,7 @@ const cookies = [
},
{
name:'csrftoken',
value:'SI5YedKIeuSAgAbdtfynUwzrmSAGquxH',
value:'isG68x9tZt73Uuc6nv1m63f4MveFL7Uy',
domain: '.threads.com',
path:'/',
httpOnly:true,
@ -54,7 +54,7 @@ const Keywords=[
// --- Scraper configuration (top-level constants in v2/scrapper/main.js) ---
// NOTE(review): this span comes from a rendered diff with no +/- markers; the
// two SCRAP_TYPE lines below are the removed/added pair of a single change
// (TAG -> KEYWORD). Taken as literal JS, redeclaring a `const` is a
// SyntaxError — only one of the two lines exists in the actual file.
// NOTE(review): the sessionid/csrftoken cookie values earlier in this file are
// live session credentials committed to the repo — and this diff leaks BOTH
// the old and new values. They should be rotated and loaded from environment
// variables instead (dotenv.config() is already called in this file).
const Version="v6";
const DEBUG_MODE=false;
const SCRAP_TYPE='TAG'; // 'KEYWORD' or 'TAG'
const SCRAP_TYPE='KEYWORD'; // 'KEYWORD' or 'TAG'
const CLEAR=true;
const COLLECTION_NAME='data-v4';

@ -1,27 +1,24 @@
import puppeteer from 'puppeteer';
/**
* Threads Search Scraper
* Navigates to the search page, enters a keyword, and extracts post URLs.
* @param {string} keyword - The search term.
* @param {number} limit - Maximum number of URLs to return.
* @param {Array} cookies - Optional: Array of Puppeteer-formatted cookies to handle login.
* Optimized Threads Search Scraper
* Uses a MutationObserver to catch posts in a virtualized list and improved scrolling.
*/
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode='KEYWORD') {
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') {
if (!keyword) {
throw new Error("Please provide a keyword for the search.");
}
const searchUrl = `https://www.threads.net/search?`
+`q=${encodeURIComponent(keyword)}`
+`&search_type=TOP`
+`&limit=${limit}`
+`&search_mode=${searchMode}`
+`&media_type=TEXT`;
console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`);
+ `q=${encodeURIComponent(keyword)}`
+ `&search_type=TOP`
+ `&search_mode=${searchMode}`;
console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`);
const browser = await puppeteer.launch({
headless: "new",
headless: true,
// slowMo: 50,
args: [
'--disable-blink-features=AutomationControlled',
'--no-sandbox',
@ -32,89 +29,104 @@ export async function searchThreads(keyword, limit = 20, cookies = [], searchMod
try {
const page = await browser.newPage();
// Anti-detection headers
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
await page.setExtraHTTPHeaders({
'Accept-Language': 'en-US,en;q=0.9',
});
if (cookies && cookies.length > 0) {
console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`);
try {
// 確保每個 cookie 對象至少包含 name, value 以及 domain 或 url
await browser.setCookie(...cookies);
console.log("[Auth] Cookies 注入成功");
// console.log(await browser.cookies());
} catch (cookieError) {
console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message);
}
console.log(`[Auth] Injecting ${cookies.length} cookies...`);
await page.setCookie(...cookies);
}
await page.setViewport({ width: 1280, height: 900 });
await page.setViewport({ width: 1280, height: 1000 });
// Navigate to search results
const response = await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });
await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });
// Check if we were redirected to the login page
const currentUrl = page.url();
if (currentUrl.includes('/login')) {
console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies.");
throw new Error("Authentication required: Threads search is restricted to logged-in users.");
if (page.url().includes('/login')) {
throw new Error("Authentication required: Redirected to login.");
}
// Wait for the results to start appearing
// Wait for initial load
try {
// Threads search result container or post links
await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 });
await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 });
} catch (e) {
console.warn("[Warning] Results did not load. This might be a login wall or no results found.");
// Take a screenshot for debugging if needed (local environment only)
// await page.screenshot({ path: 'search_debug.png' });
console.warn("[Warning] Initial posts not found.");
}
// Scroll logic to gather more results
await page.evaluate(async (maxItems) => {
await new Promise((resolve) => {
let totalHeight = 0;
let distance = 500;
let timer = setInterval(() => {
let scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
const currentLinks = document.querySelectorAll('a[href*="/post/"]').length;
// Stop if we hit the limit, bottom of page, or a safety cap
if (totalHeight >= scrollHeight || currentLinks >= maxItems || totalHeight > 15000) {
clearInterval(timer);
resolve();
}
}, 300);
});
}, limit);
/**
* The Scrape Logic:
* 1. Uses MutationObserver to watch for NEW links added to the DOM.
* 2. Periodically scrolls and waits for the network to fetch more.
* 3. Collects unique URLs into an Internal Set.
*/
const uniqueUrls = await page.evaluate(async (maxItems) => {
const discovered = new Set();
// Extracting the URLs
const postUrls = await page.evaluate(() => {
return new Promise((resolve) => {
// Function to extract valid post links from current DOM
const grabLinks = () => {
const links = Array.from(document.querySelectorAll('a[href*="/post/"]'));
return links
.map(a => a.href)
// Filter for actual post links and ignore UI elements like repost/share buttons
.filter(href => {
const isPost = href.includes('/post/');
const isNotAction = !href.includes('/reposts') && !href.includes('/replies');
return isPost && isNotAction;
links.forEach(a => {
const href = a.href;
// Filter out noise like /reposts or /replies sub-pages
if (!href.includes('/reposts') && !href.includes('/replies')) {
// Threads links often have queries, clean them for de-duplication
const cleanUrl = href.split('?')[0];
discovered.add(cleanUrl);
}
});
};
// Setup observer to catch posts as they are rendered during scroll
const observer = new MutationObserver(() => {
grabLinks();
if (discovered.size >= maxItems) {
cleanup();
}
});
// Deduplicate using a Set
const uniqueUrls = [...new Set(postUrls)].slice(0, limit);
observer.observe(document.body, { childList: true, subtree: true });
const cleanup = () => {
observer.disconnect();
clearInterval(scrollInterval);
resolve(Array.from(discovered));
};
// Perform incremental scrolling
let lastHeight = document.body.scrollHeight;
let scrollAttempts = 0;
const scrollInterval = setInterval(() => {
window.scrollBy(0, 800);
grabLinks(); // Manual grab just in case
if (discovered.size >= maxItems) {
cleanup();
return;
}
// Check if we've hit the bottom and no new content is loading
let newHeight = document.body.scrollHeight;
if (newHeight === lastHeight) {
scrollAttempts++;
if (scrollAttempts > 10) cleanup(); // Stop if stuck for ~5 seconds
} else {
lastHeight = newHeight;
scrollAttempts = 0;
}
}, 500);
// Safety timeout: 45 seconds total for scraping
setTimeout(cleanup, 45000);
});
}, limit);
console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`);
const resultUrls = uniqueUrls.slice(0, limit);
console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`);
return {
keyword,
count: uniqueUrls.length,
urls: uniqueUrls
count: resultUrls.length,
urls: resultUrls
};
} catch (error) {

Loading…
Cancel
Save