Skip to content

Commit e7f557e

Browse files
Tom authored and brendanheywood committed
issue108: Priority does not flow down to child nodes (#109)
* issue108: Priority does not flow down to child nodes #108
  Add parent priority to child nodes when marking for crawl.
* enhancement: add priority to direct child nodes #108
  This enhancement aims to flow priority down to direct child nodes only. Through the implementation of node levels and a level check when marking a node to be crawled, we only assign a parent's priority to a child node if it is a direct descendant of the original node. This prevents priority from being passed on recursively; otherwise, if for example a child node is a top-level node, the priority would filter down to effectively all nodes, which is undesirable behaviour.
* fix: Add priority check to node
* fix: Remove extra table closing tag in install.xml
* style: remove additional line in upgrade script
* tests: Add unit tests for issue #108
* tests: Add priority provider to test all possible parent priorities
1 parent 97b3799 commit e7f557e

File tree

6 files changed

+132
-6
lines changed

6 files changed

+132
-6
lines changed

classes/robot/crawler.php

+26-2
Original file line numberDiff line numberDiff line change
@@ -306,12 +306,20 @@ public function get_queue_size() {
306306
* @return object|boolean The node record if the resource pointed to by the URL can and should be considered; or `false` if the
307307
* URL is invalid or excluded.
308308
*/
309-
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT) {
309+
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT,
310+
$level = TOOL_CRAWLER_NODE_LEVEL_PARENT) {
310311

311312
global $DB, $CFG;
312313

313314
$url = $this->absolute_url($baseurl, $url);
314315

316+
// Strip priority from indirect child nodes. Only parent and direct children
317+
// of parent nodes have priority applied to avoid recursively applying priority
318+
// to all descendants of a parent node.
319+
if ($level == TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD) {
320+
$priority = TOOL_CRAWLER_PRIORITY_DEFAULT;
321+
}
322+
315323
// Filter out non http protocols like mailto:user@example.com etc.
316324
$bits = parse_url($url);
317325
if (array_key_exists('scheme', $bits)
@@ -420,6 +428,7 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO
420428
$node->external = self::is_external($url);
421429
$node->needscrawl = time();
422430
$node->priority = $priority;
431+
$node->level = $level;
423432

424433
if (isset($courseid)) {
425434
$node->courseid = $courseid;
@@ -438,6 +447,11 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO
438447
$node->priority = $priority;
439448
$needsupdating = true;
440449
}
450+
if ($node->level != $level) {
451+
// Set the level again, in case this node has been seen again at a different
452+
// level, to avoid reprocessing.
453+
$node->level = $level;
454+
}
441455
if (isset($courseid)) {
442456
$node->courseid = $courseid;
443457
$needsupdating = true;
@@ -901,8 +915,18 @@ private function link_from_node_to_url($from, $url, $text, $idattr) {
901915

902916
global $DB;
903917

918+
// Ascertain the correct node level based on parent node level.
919+
if (!empty($from->level) && $from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) {
920+
$level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD;
921+
} else {
922+
$level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD;
923+
}
924+
925+
$priority = isset($from->priority) ? $from->priority : TOOL_CRAWLER_PRIORITY_DEFAULT;
926+
$courseid = isset($from->courseid) ? $from->courseid : null;
927+
904928
// Add the node URL to the queue.
905-
$to = $this->mark_for_crawl($from->url, $url);
929+
$to = $this->mark_for_crawl($from->url, $url, $courseid, $priority, $level);
906930
if ($to === false) {
907931
return false;
908932
}

constants.php

+9
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,12 @@
9393
define('TOOL_CRAWLER_PRIORITY_DEFAULT', 0);
9494
define('TOOL_CRAWLER_PRIORITY_NORMAL', 50);
9595
define('TOOL_CRAWLER_PRIORITY_HIGH', 100);
96+
97+
/**
98+
* Node level assigned to each node based on whether it is the parent node, or
99+
* a child node discovered within a parent when crawling, or any child of a child
100+
* node (or even further removed).
101+
*/
102+
define('TOOL_CRAWLER_NODE_LEVEL_PARENT', 2);
103+
define('TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD', 1);
104+
define('TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD', 0);

db/install.xml

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
<FIELD NAME="httpmsg" TYPE="text" NOTNULL="false" SEQUENCE="false"/>
2828
<FIELD NAME="errormsg" TYPE="text" NOTNULL="false" SEQUENCE="false"/>
2929
<FIELD NAME="priority" TYPE="int" LENGTH="10" NOTNULL="false" SEQUENCE="false" DEFAULT="0"/>
30+
<FIELD NAME="level" TYPE="int" LENGTH="1" NOTNULL="false" DEFAULT="2" SEQUENCE="false" COMMENT="Node level, 2 for parent node, 1 for direct child and 0 for subsequent children."/>
3031
</FIELDS>
3132
<KEYS>
3233
<KEY NAME="primary" TYPE="primary" FIELDS="id"/>

db/upgrade.php

+15
Original file line numberDiff line numberDiff line change
@@ -94,5 +94,20 @@ function xmldb_tool_crawler_upgrade($oldversion) {
9494
upgrade_plugin_savepoint(true, 2019100300, 'tool', 'crawler');
9595
}
9696

97+
if ($oldversion < 2020012300) {
98+
99+
// Define field level to be added to tool_crawler_url.
100+
$table = new xmldb_table('tool_crawler_url');
101+
$field = new xmldb_field('level', XMLDB_TYPE_INTEGER, '1', null, null, null, '2', 'priority');
102+
103+
// Conditionally launch add field level.
104+
if (!$dbman->field_exists($table, $field)) {
105+
$dbman->add_field($table, $field);
106+
}
107+
108+
// Crawler savepoint reached.
109+
upgrade_plugin_savepoint(true, 2020012300, 'tool', 'crawler');
110+
}
111+
97112
return true;
98113
}

tests/phpunit/robot_crawler_test.php

+79-2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
defined('MOODLE_INTERNAL') || die('Direct access to this script is forbidden');
2828

2929
require_once(__DIR__ . '/../../locallib.php');
30+
require_once(__DIR__ . '/../../constants.php');
3031

3132
/**
3233
* Unit tests for link crawler robot
@@ -295,6 +296,7 @@ public function test_should_be_excluded() {
295296
$node->contents = $page . $linktoexclude;
296297
$node->url = $url;
297298
$node->id = $insertid;
299+
$node->level = TOOL_CRAWLER_NODE_LEVEL_PARENT;
298300

299301
$this->resetAfterTest(true);
300302

@@ -310,6 +312,81 @@ public function test_should_be_excluded() {
310312
self::assertFalse($found);
311313
}
312314

313-
}
314-
315+
/**
316+
* Priority provider.
317+
*
318+
* @return array of potential crawler priority codes.
319+
*/
320+
public function priority_provider() {
321+
return [
322+
['high' => TOOL_CRAWLER_PRIORITY_HIGH],
323+
['normal' => TOOL_CRAWLER_PRIORITY_NORMAL],
324+
['default' => TOOL_CRAWLER_PRIORITY_DEFAULT]
325+
];
326+
}
315327

328+
/**
329+
* @dataProvider priority_provider
330+
*
331+
* Test for issue #108 - passing node crawl priority to child nodes when parsing html.
332+
*/
333+
public function test_parse_html_priority_inheritance($parentpriority) {
334+
global $CFG, $DB;
335+
336+
$parentlocalurl = 'course/view.php?id=1&section=2';
337+
$directchildlocalurl = 'mod/book/view.php?id=7';
338+
$indirectchildexternalurl = 'http://someexternalsite.net.au';
339+
340+
// Internal parent node.
341+
$node = $this->robot->mark_for_crawl($CFG->wwwroot, $parentlocalurl, 1, $parentpriority);
342+
$node->httpcode = 200;
343+
$node->mimetype = 'text/html';
344+
$node->external = 0;
345+
$node->contents = <<<HTML
346+
<!doctype html>
347+
<html>
348+
<head>
349+
<meta charset="utf-8"/>
350+
<title>Test title</title>
351+
</head>
352+
<body class="course-1">
353+
<a href="$CFG->wwwroot/$directchildlocalurl">Direct child node</a>
354+
</body>
355+
</html>
356+
HTML;
357+
// Parse the parent node, to create the direct child node.
358+
$parentnode = $this->robot->parse_html($node, $node->external);
359+
360+
// Internal node direct child.
361+
$url = new moodle_url('/' . $directchildlocalurl);
362+
$node = $DB->get_record('tool_crawler_url', array('url' => $url->raw_out()) );
363+
$node->url = $CFG->wwwroot.'/'.$directchildlocalurl;
364+
$node->httpcode = 200;
365+
$node->mimetype = 'text/html';
366+
$node->external = 0;
367+
$node->contents = <<<HTML
368+
<!doctype html>
369+
<html>
370+
<head>
371+
<meta charset="utf-8"/>
372+
<title>Test title</title>
373+
</head>
374+
<body class="course-1">
375+
<a href="$indirectchildexternalurl">Indirect child node</a>
376+
</body>
377+
</html>
378+
HTML;
379+
// Parse the direct child, to create the indirect child node.
380+
$directchildnode = $this->robot->parse_html($node, $node->external);
381+
$indirectchildnode = $DB->get_record('tool_crawler_url', ['url' => $indirectchildexternalurl]);
382+
383+
// Direct child nodes should inherit priority from parent node (super node).
384+
$this->assertEquals($parentnode->priority, $directchildnode->priority);
385+
// Indirect child nodes should not inherit priority from parent node (super node).
386+
$this->assertGreaterThanOrEqual($indirectchildnode->priority, $parentnode->priority);
387+
// Indirect child nodes should not inherit priority from direct child node.
388+
$this->assertGreaterThanOrEqual($indirectchildnode->priority, $directchildnode->priority);
389+
// Indirect child nodes should not be able to have a high priority.
390+
$this->assertLessThan(TOOL_CRAWLER_PRIORITY_HIGH, $indirectchildnode->priority);
391+
}
392+
}

version.php

+2-2
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
defined('MOODLE_INTERNAL') || die();
2828

2929

30-
$plugin->version = 2020010600; // The current plugin version (Date: YYYYMMDDXX)
31-
$plugin->release = 2020010600; // The current plugin version (Date: YYYYMMDDXX)
30+
$plugin->version = 2020012300; // The current plugin version (Date: YYYYMMDDXX)
31+
$plugin->release = 2020012300; // The current plugin version (Date: YYYYMMDDXX)
3232
$plugin->requires = 2016021800; // Requires this Moodle version.
3333
$plugin->component = 'tool_crawler'; // To check on upgrade, that module sits in correct place.
3434
$plugin->maturity = MATURITY_STABLE;

0 commit comments

Comments
 (0)