You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
30 lines
738 B
30 lines
738 B
const { convert } = require('html-to-text');
|
|
const fs = require('fs');
|
|
const { format } = require('path');
|
|
|
|
// Options passed to html-to-text's `convert`.
// `wordwrap` is intentionally not set, so the library default applies;
// add e.g. `wordwrap: 130` here to change the wrap column.
const options = {
  selectors: [
    // Drop <img> elements entirely so they leave no trace in the text output.
    { selector: 'img', format: 'skip' },
  ],
};
|
|
|
|
|
|
// Convert every `.html` file in `folder` to plain text and write the result,
// under the same base name with a `.txt` extension, into `outputFolder`.
// Both paths are relative to the current working directory of the process.
const folder = '../raw/html';
const outputFolder = '../raw/txt';
const htmlExtension = '.html';

const files = fs.readdirSync(folder).filter((file) => file.endsWith(htmlExtension));
console.log(`Found ${files.length} HTML files in ${folder}`);

// writeFileSync does not create missing directories; make sure the target exists.
fs.mkdirSync(outputFolder, { recursive: true });

for (const file of files) {
  const filePath = `${folder}/${file}`;
  const html = fs.readFileSync(filePath, 'utf8');
  const text = convert(html, options);

  // Swap only the trailing extension. A blanket replace of 'html' across the
  // whole path would also rewrite names such as `html-guide.html`.
  const baseName = file.slice(0, -htmlExtension.length);
  const outputFilePath = `${outputFolder}/${baseName}.txt`;

  fs.writeFileSync(outputFilePath, text, 'utf8');
  console.log(`Converted ${file} to ${outputFilePath}`);
}