Skip to content

Commit 0d58291

Browse files
author
Mauve Signweaver
committed
feat: use website-scraper instead of wget2
1 parent: 58793b7 · commit: 0d58291

File tree

4 files changed

+1030
-63
lines changed

4 files changed

+1030
-63
lines changed

ansible/roles/distributed_press/tasks/main.yml

-7
Original file line number | Diff line number | Diff line change
@@ -25,13 +25,6 @@
2525
groups: www-data
2626
append: yes
2727

28-
- name: "Install wget2"
29-
apt:
30-
pkg:
31-
- wget2
32-
state: latest
33-
update_cache: true
34-
3528
- name: "Install git and ufw"
3629
apt:
3730
pkg:

config/sites.ts

+18-23
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,16 @@
11
import { NewSite, Site, UpdateSite, SiteStats } from '../api/schemas.js'
22
import { Static } from '@sinclair/typebox'
3+
34
import { Config } from './store.js'
45
import { AbstractLevel } from 'abstract-level'
56
import { ProtocolManager } from '../protocols/index.js'
67
import { Ctx } from '../protocols/interfaces.js'
8+
79
import isValidHostname from 'is-valid-hostname'
810
import createError from 'http-errors'
9-
import { promisify } from 'node:util'
10-
import child_process from 'node:child_process'
11-
import path from 'node:path'
12-
const exec = promisify(child_process.exec)
11+
import scrape from 'website-scraper'
12+
// @ts-expect-error
13+
import SaveToExistingDirectoryPlugin from 'website-scraper-existing-directory'
1314

1415
export class SiteConfigStore extends Config<Static<typeof Site>> {
1516
protocols: ProtocolManager
@@ -34,25 +35,19 @@ export class SiteConfigStore extends Config<Static<typeof Site>> {
3435
return await this.db.put(id, obj).then(() => obj)
3536
}
3637

37-
async clone (siteId: string, filePath: string, ctx?: Ctx): Promise<void> {
38-
const cwd = path.resolve(filePath, '..')
39-
const destination = filePath.split(path.sep).at(-1) as string
40-
41-
await exec(`wget2 \
42-
--random-wait \
43-
--compression=identity,gzip,br \
44-
--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0" \
45-
--local-encoding=UTF-8 \
46-
--mirror \
47-
--page-requisites \
48-
--convert-links \
49-
--adjust-extension \
50-
--no-host-directories \
51-
--directory-prefix=${destination} \
52-
--base=https://${siteId} \
53-
"https://${siteId}"`, { cwd })
54-
55-
await this.sync(siteId, filePath, ctx)
38+
async clone (siteId: string, directory: string, ctx?: Ctx): Promise<void> {
39+
const siteUrl = `https://${siteId}`
40+
41+
await scrape({
42+
plugins: [new SaveToExistingDirectoryPlugin()],
43+
urls: [siteUrl],
44+
directory,
45+
recursive: true,
46+
maxRecursiveDepth: 10,
47+
urlFilter: (url) => url.startsWith(siteUrl)
48+
})
49+
50+
await this.sync(siteId, directory, ctx)
5651
}
5752

5853
async sync (siteId: string, filePath: string, ctx?: Ctx): Promise<void> {

0 commit comments

Comments
 (0)