You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
199 lines
6.9 KiB
199 lines
6.9 KiB
import { jsonToAgent, parseTest } from "./agent.js";
|
|
import { getThread } from "./scrapper.js";
|
|
import { searchThreads } from "./search.js";
|
|
import { writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from "fs";
|
|
import dotenv from 'dotenv';
|
|
import { processData } from "./embeddings.js";
|
|
dotenv.config();
|
|
|
|
|
|
// Cookies handed to searchThreads() and getThread() with every request.
// Each entry shares the same domain/path/flags and is given an expiry one
// year (in seconds) after the moment this module is evaluated.
// NOTE(review): the values below look like live session credentials that are
// committed to source — consider loading them from the environment instead.
const cookies = [
  ['sessionid', '64605724719%3ALlZCmwghVyOAck%3A23%3AAYhptDoKttkGRWkpa5583neohBfLXlGfOlwPPmdP1w'],
  ['ds_user_id', '64605724719'],
  ['csrftoken', 'SI5YedKIeuSAgAbdtfynUwzrmSAGquxH'],
].map(([name, value]) => ({
  name,
  value,
  domain: '.threads.com',
  path: '/',
  httpOnly: true,
  secure: true,
  expires: Math.floor(Date.now() / 1000) + 60 * 60 * 24 * 365,
}));
|
|
|
|
// Pool of search phrases (space-separated terms). The rows mirror the
// original line layout and are flattened into a single list of strings.
// Nothing in the visible part of this file reads this list; step1() uses its
// own local selection.
const Keywords = [
  ["學貸 房租 焦慮", "獨立 失敗", "尷尬 年齡", "還沒買房", "卡住 職涯"],
  ["躺平 世代", "家境 差距", "文組 標籤", "覺醒 厭世", "年輕"],
  ["慣老闆 權益", "行政 刁難", "學生會 被架空", "職場 不平等", "家長會 決定"],
  ["租屋 系統 惡意", "升學 制度 不公", "系統性 壓榨", "黑箱 決策", "系統 漏洞", "校規", "SOP", "行政 裁量權"],
  ["租屋 格局 壓迫", "辦公室 權力", "校園 空間 設計", "宿舍 小 隱私", "空間 不舒服"],
  ["會議室 借不到", "圖書館 規定", "公共空間 誰的", "校車 收費 影響", "畢業 退場"],
  ["人生 妥協", "我的 決定", "邊界 被侵犯", "下班 不讀", "工作 生活 平衡"],
  ["參與 沒用", "開會 敷衍", "假 民主", "意見 被忽略", "提案 被擋", "連署 實質"],
  ["迷因 轉化 焦慮", "幽默 反抗", "厭世 創作", "負面情緒 詩意", "話語權 奪回"],
  ["歸屬感 飄", "租來的 人生", "網路 社群 溫暖", "家鄉 不屬於", "靈魂 避難所"],
].flat();
|
|
|
|
|
|
// Suffix of the output directory: step2() writes to and step3() reads from
// `./processed_<Version>`.
const Version = 'v3';

// When true, step1() and step2() only work through a small sample of their input.
const DEBUG_MODE = false;

// Search mode forwarded to searchThreads(): either 'KEYWORD' or 'TAG'.
const SCRAP_TYPE = 'KEYWORD';

// When true, step3() clears the collection before it processes any file.
const CLEAR = true;
|
|
|
|
/**
 * Step 1 — scrape.
 *
 * Runs a search for every selected term (keywords or tags, depending on
 * SCRAP_TYPE), fetches each thread URL the search returns and writes the
 * result to `scrapped/<term>/<postId>.json`.
 *
 * A failure while searching one term or fetching one thread is logged and
 * skipped so the remaining terms/threads are still processed.
 *
 * @returns {Promise<void>}
 */
async function step1() {
  const chooseThemse = ['職涯','00後整頓職場','生活方式','長大後才懂的事','privilege'];
  const chooseKeywords = ['成年 限制 學貸 獨立','學貸 房租 焦慮','職場 權利 privilege','職涯 嚮往','門禁 自由 限制','人生 妥協 期待','人生 自主'];

  const select = SCRAP_TYPE === 'KEYWORD' ? chooseKeywords : chooseThemse;

  // Debug runs only scrape the first term of the active list.
  const terms = DEBUG_MODE ? select.slice(0, 1) : select;

  for (const keyword of terms) {
    let searchResults;
    try {
      searchResults = await searchThreads(keyword, 50, cookies, SCRAP_TYPE);
    } catch (err) {
      // One failing search must not abort the remaining terms.
      console.error("Error searching", keyword, ":", err);
      continue;
    }
    console.log(JSON.stringify(searchResults));

    // Tolerate a missing or malformed result instead of throwing on iteration.
    const urls = Array.isArray(searchResults?.urls) ? searchResults.urls : [];
    const dir = `scrapped/${keyword}`;

    for (const url of urls) {
      try {
        // Derive the file name first so an unusable URL never costs a fetch.
        // The id is the path segment after "/post/", without query or fragment.
        const fileName = String(url).split("/post/")[1]?.split(/[/?#]/)[0];
        if (!fileName) {
          throw new Error(`No post id found in URL: ${url}`);
        }

        const threadItems = await getThread(url, cookies);

        // `recursive: true` makes this a no-op when the directory already exists.
        mkdirSync(dir, { recursive: true });
        writeFileSync(`${dir}/${fileName}.json`, JSON.stringify(threadItems, null, 2));
      } catch (err) {
        console.error("Error processing", url, ":", err);
      }
    }
  }
}
|
|
|
|
/**
 * Step 2 — agent pass.
 *
 * Walks every sub-directory of `./scrapped`, sends each file in it through
 * jsonToAgent() and writes the returned data as JSON to
 * `./processed_<Version>/<folder>/<file>`.
 *
 * In debug mode only the first `count` entries of `./scrapped` are visited and
 * only the first file of each folder is processed.
 *
 * Errors for a single file are logged and do not stop the run.
 *
 * @returns {Promise<void>}
 */
async function step2() {
  const folders = readdirSync('./scrapped');
  console.log("Folders in raw folder:", folders);

  // Number of directory entries visited in debug mode.
  const count = 2;

  for (let i = 0; i < folders.length; i++) {
    if (DEBUG_MODE && i >= count) break;

    const folder = folders[i];

    // Skip plain files that sit next to the per-term folders.
    const isFolder = statSync(`./scrapped/${folder}`).isDirectory();
    if (!isFolder) {
      console.log(`Folder ${folder} is empty or does not exist.`);
      continue;
    }

    const files = readdirSync(`./scrapped/${folder}`);
    console.log(`Files in folder ${folder}:`, files);

    const outputDir = `./processed_${Version}/${folder}`;

    for (const [index, file] of files.entries()) {
      // Debug: stop after the first file of this folder, then move on to the
      // next folder. (`break`, not `return`, so the outer loop keeps going.)
      if (DEBUG_MODE && index > 0) break;

      try {
        // Second argument is presumably the agent prompt/definition file —
        // its meaning is defined in agent.js.
        const data = await jsonToAgent(`./scrapped/${folder}/${file}`, 'agent_v2.txt');

        const outputFilePath = `${outputDir}/${file}`;

        // `recursive: true` makes this a no-op when the directory already exists.
        mkdirSync(outputDir, { recursive: true });

        writeFileSync(outputFilePath, JSON.stringify(data, null, 2));
        console.log("Saved agent response to", outputFilePath);
      } catch (err) {
        console.error("Error processing agent for folder", folder, ":", err);
      }
    }
  }
}
|
|
|
|
/**
 * Step 3 — embeddings.
 *
 * Walks every sub-directory of `./processed_<Version>` and passes each file to
 * processData(). When CLEAR is true the target collection is cleared first.
 *
 * Errors for a single file are logged and do not stop the run.
 *
 * @returns {Promise<void>}
 * @throws {Error} When CLEAR is true but no `clearCollection` function can be
 *   found, so that data is never appended to a collection that was meant to
 *   be emptied.
 */
async function step3() {
  // Third argument given to processData(); presumably the collection name.
  const collection = `data-v3`;

  const folders = readdirSync(`./processed_${Version}`);
  console.log("Folders in raw folder:", folders);

  if (CLEAR) {
    // Neither `clearCollection` nor `collection` was imported or declared in
    // this file, so this branch used to die with a ReferenceError.
    // NOTE(review): assumes clearCollection is exported by ./embeddings.js and
    // accepts the same collection name processData receives — confirm.
    const embeddings = await import("./embeddings.js");
    if (typeof embeddings.clearCollection !== 'function') {
      throw new Error('CLEAR is enabled but "./embeddings.js" does not export clearCollection');
    }
    await embeddings.clearCollection(collection);
  }

  for (const folder of folders) {
    // Skip plain files that sit next to the per-term folders.
    const isFolder = statSync(`./processed_${Version}/${folder}`).isDirectory();
    if (!isFolder) {
      console.log(`Folder ${folder} is empty or does not exist.`);
      continue;
    }

    const files = readdirSync(`./processed_${Version}/${folder}`);
    console.log(`Files in folder ${folder}:`, files);

    for (const file of files) {
      try {
        await processData(`./processed_${Version}/${folder}/${file}`, false, collection);
      } catch (err) {
        console.error("Error processing embeddings for folder", folder, ":", err);
      }
    }
  }
}
|
|
|
|
/**
 * Entry point. The pipeline steps are toggled by hand: uncomment the step(s)
 * that should run.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // await step1();
  // await getThread('https://www.threads.com/@pytteliten_/post/DJpeh8BoO3a', cookies);

  await step2();
  // await parseTest('./processed_v3/00後整頓職場/DBEB0ACzF4j.json');
  // await step3();
}

// Do not leave the promise floating: report the failure and exit non-zero.
main().catch((err) => {
  console.error("Pipeline failed:", err);
  process.exitCode = 1;
});