1
1
import { NewSite , Site , UpdateSite , SiteStats } from '../api/schemas.js'
2
2
import { Static } from '@sinclair/typebox'
3
+
3
4
import { Config } from './store.js'
4
5
import { AbstractLevel } from 'abstract-level'
5
6
import { ProtocolManager } from '../protocols/index.js'
6
7
import { Ctx } from '../protocols/interfaces.js'
8
+
7
9
import isValidHostname from 'is-valid-hostname'
8
10
import createError from 'http-errors'
9
- import { promisify } from 'node:util'
10
- import child_process from 'node:child_process'
11
- import path from 'node:path'
12
- const exec = promisify ( child_process . exec )
11
+ import scrape from 'website-scraper'
12
+ // @ts-expect-error
13
+ import SaveToExistingDirectoryPlugin from 'website-scraper-existing-directory'
13
14
14
15
export class SiteConfigStore extends Config < Static < typeof Site > > {
15
16
protocols : ProtocolManager
@@ -34,25 +35,19 @@ export class SiteConfigStore extends Config<Static<typeof Site>> {
34
35
return await this . db . put ( id , obj ) . then ( ( ) => obj )
35
36
}
36
37
37
- async clone ( siteId : string , filePath : string , ctx ?: Ctx ) : Promise < void > {
38
- const cwd = path . resolve ( filePath , '..' )
39
- const destination = filePath . split ( path . sep ) . at ( - 1 ) as string
40
-
41
- await exec ( `wget2 \
42
- --random-wait \
43
- --compression=identity,gzip,br \
44
- --user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0" \
45
- --local-encoding=UTF-8 \
46
- --mirror \
47
- --page-requisites \
48
- --convert-links \
49
- --adjust-extension \
50
- --no-host-directories \
51
- --directory-prefix=${ destination } \
52
- --base=https://${ siteId } \
53
- "https://${ siteId } "` , { cwd } )
54
-
55
- await this . sync ( siteId , filePath , ctx )
38
/**
 * Scrapes the site at `https://<siteId>` into `directory`, then runs
 * `sync` on the result.
 *
 * Only URLs whose origin (scheme + host + port) equals the site's origin are
 * followed. Recursion is capped at a depth of 10.
 *
 * @param siteId - Host of the site to clone; interpolated into `https://${siteId}`.
 * @param directory - Directory handed to the scraper as its output location
 *   and then to `sync`.
 * @param ctx - Optional protocol context, forwarded unchanged to `sync`.
 * @returns Resolves once both the scrape and the subsequent `sync` finish.
 * @throws TypeError (as a rejection) when `https://${siteId}` cannot be parsed as a URL.
 */
async clone (siteId: string, directory: string, ctx?: Ctx): Promise<void> {
  const siteUrl = `https://${siteId}`
  // Parse once up front: a raw prefix test on the URL string would also accept
  // look-alike hosts such as `https://<siteId>.evil.net/` or
  // `https://<siteId>@evil.net/`, letting the crawl escape the target site.
  const siteOrigin = new URL(siteUrl).origin

  const isSameOrigin = (url: string): boolean => {
    try {
      return new URL(url).origin === siteOrigin
    } catch {
      // Anything that is not a parseable absolute URL cannot belong to the site.
      return false
    }
  }

  await scrape({
    plugins: [new SaveToExistingDirectoryPlugin()],
    urls: [siteUrl],
    directory,
    recursive: true,
    maxRecursiveDepth: 10,
    urlFilter: isSameOrigin
  })

  await this.sync(siteId, directory, ctx)
}
57
52
58
53
async sync ( siteId : string , filePath : string , ctx ?: Ctx ) : Promise < void > {
0 commit comments