
Commit 90bc4ab

Add index to tool_crawler_url url field catalyst#113 (catalyst#114)
* Add required persistent API for url table and add urlhash field and index

- Use the persistent API for each function that inserts or updates tool_crawler_url
- Rename reserved database keywords
- Change queries to select on urlhash, not url, to use the new index
- Use a shared hash function to hash urls, so it can be updated in a single place
- Avoid a divide by zero warning when we have no processed urls
1 parent 18ab64b commit 90bc4ab

11 files changed (+457 −138 lines)
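
The heart of the change described above is looking rows up by a fixed-length hash instead of by the raw URL text. As a minimal sketch of the before/after query pattern, assuming Moodle's standard $DB API ($url here is a hypothetical variable; the table and column names are from the diff below):

    global $DB;

    // Before: match on the raw url, a TEXT column with no index.
    $rec = $DB->get_record_sql(
        "SELECT * FROM {tool_crawler_url} WHERE url = ?", [$url]);

    // After: match on the new, indexed urlhash column. The hash is computed
    // by one shared function so inserts, updates and selects cannot drift.
    $rec = $DB->get_record('tool_crawler_url',
        ['urlhash' => \tool_crawler\local\url::hash_url($url)]);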

classes/local/url.php (+239 lines)
@@ -0,0 +1,239 @@
<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.

/**
 * url class
 * defines the model for a url record, for CRUD operations on the url table
 *
 * @package    tool_crawler
 * @author     Kristian Ringer <[email protected]>
 * @copyright  2020 Catalyst IT
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */

namespace tool_crawler\local;

use tool_crawler\robot\crawler;

defined('MOODLE_INTERNAL') || die();

/**
 * url class.
 *
 * @package    tool_crawler
 * @author     Kristian Ringer <[email protected]>
 * @copyright  2020 Catalyst IT
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class url extends \core\persistent {

    /** Table name for the persistent. */
    const TABLE = 'tool_crawler_url';

    /**
     * Return the definition of the properties of this model.
     *
     * @return array
     */
    protected static function define_properties() {
        return array(
            'url' => array(
                'type' => PARAM_TEXT,
            ),
            'urlhash' => array(
                'type' => PARAM_TEXT,
            ),
            'externalurl' => array(
                'type' => PARAM_TEXT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'lastcrawled' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'needscrawl' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'httpcode' => array(
                'type' => PARAM_TEXT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'mimetype' => array(
                'type' => PARAM_TEXT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'title' => array(
                'type' => PARAM_TEXT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'downloadduration' => array(
                'type' => PARAM_FLOAT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'filesize' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'filesizestatus' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'redirect' => array(
                'type' => PARAM_TEXT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'courseid' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'contextid' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'cmid' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'ignoreduserid' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'ignoredtime' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'httpmsg' => array(
                'type' => PARAM_TEXT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'errormsg' => array(
                'type' => PARAM_TEXT,
                'null' => NULL_ALLOWED,
                'default' => null,
            ),
            'priority' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => '0',
            ),
            'urllevel' => array(
                'type' => PARAM_INT,
                'null' => NULL_ALLOWED,
                'default' => '2',
            ),
        );
    }

    /**
     * Create the hashed field before inserting or updating a record.
     * This runs each time the persistent is validated, before it is saved.
     */
    protected function before_validate() {
        $url = $this->get('url');
        $this->set('urlhash', self::hash_url($url));
    }

    /**
     * Reset a node to be recrawled.
     *
     * @param integer $nodeid node id
     */
    public static function reset_for_recrawl($nodeid) {

        global $DB;

        if ($node = new url($nodeid)) {

            $time = crawler::get_config()->crawlstart;

            // Mark all nodes that link to this as needing a recrawl.
            if ($DB->get_dbfamily() == 'mysql') {
                $DB->execute("UPDATE {tool_crawler_url} u
                         INNER JOIN {tool_crawler_edge} e ON e.a = u.id
                                SET needscrawl = ?,
                                    lastcrawled = null,
                                    priority = ?
                              WHERE e.b = ?", [$time, TOOL_CRAWLER_PRIORITY_HIGH, $nodeid]);
            } else {
                $DB->execute("UPDATE {tool_crawler_url} u
                                 SET needscrawl = ?,
                                     lastcrawled = null,
                                     priority = ?
                                FROM {tool_crawler_edge} e
                               WHERE e.a = u.id
                                 AND e.b = ?", [$time, TOOL_CRAWLER_PRIORITY_HIGH, $nodeid]);
            }
            // Delete all edges that point to this node.
            $DB->delete_records('tool_crawler_edge', ['b' => $nodeid]);
            // Delete the 'to' node as it may be completely wrong.
            $DB->delete_records('tool_crawler_url', array('id' => $nodeid));
        }
    }

    /**
     * How many URLs are in the queue now (more will probably be added).
     *
     * @return int size of queue
     */
    public function get_queue_size() {
        global $DB;

        return $DB->get_field_sql("
                SELECT COUNT(*)
                  FROM {tool_crawler_url}
                 WHERE lastcrawled IS NULL
                    OR lastcrawled < needscrawl");
    }

    /**
     * How many URLs have been processed off the queue.
     *
     * @return int size of processed list
     */
    public function get_processed() {
        global $DB;

        return $DB->get_field_sql("
                SELECT COUNT(*)
                  FROM {tool_crawler_url}
                 WHERE lastcrawled >= ?",
                array(crawler::get_config()->crawlstart));
    }

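    // Note: per the commit message, any caller computing a percentage from
    // get_processed() must guard the division, since nothing may have been
    // processed yet. Illustrative only, not part of this class:
    //     $done = $this->get_processed();
    //     $percent = $done ? round(100 * $done / ($done + $this->get_queue_size())) : 0;
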
    /**
     * Hash a url.
     *
     * @param string $url the url to hash
     * @return string the hashed url
     */
    public static function hash_url($url) {
        return sha1($url);
    }
}
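
For illustration, a short sketch of how the new persistent and the shared hash function fit together (the calling code and example URL are hypothetical; url::hash_url(), the urlhash field and before_validate() are from the class above):

    // Queueing a URL: before_validate() fills urlhash automatically on save.
    $node = new \tool_crawler\local\url(0, (object) [
        'url' => 'https://example.com/course/view.php?id=2',
        'needscrawl' => time(),
    ]);
    $node->save();

    // Finding it again: hash the URL with the same shared function and hit
    // the new index on urlhash rather than scanning the TEXT url column.
    global $DB;
    $found = $DB->get_record('tool_crawler_url', [
        'urlhash' => \tool_crawler\local\url::hash_url('https://example.com/course/view.php?id=2'),
    ]);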
