|
| 1 | +<?php |
| 2 | +// This file is part of Moodle - http://moodle.org/ |
| 3 | +// |
| 4 | +// Moodle is free software: you can redistribute it and/or modify |
| 5 | +// it under the terms of the GNU General Public License as published by |
| 6 | +// the Free Software Foundation, either version 3 of the License, or |
| 7 | +// (at your option) any later version. |
| 8 | +// |
| 9 | +// Moodle is distributed in the hope that it will be useful, |
| 10 | +// but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | +// GNU General Public License for more details. |
| 13 | +// |
| 14 | +// You should have received a copy of the GNU General Public License |
| 15 | +// along with Moodle. If not, see <http://www.gnu.org/licenses/>. |
| 16 | + |
| 17 | +/** |
| 18 | + * Document representation. |
| 19 | + * |
| 20 | + * @package search_postgresfulltext |
| 21 | + * @copyright 2017 Catalyst IT |
| 22 | + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later |
| 23 | + */ |
| 24 | + |
| 25 | +namespace search_postgresfulltext; |
| 26 | + |
| 27 | +defined('MOODLE_INTERNAL') || die(); |
| 28 | + |
| 29 | + |
| 30 | +require_once($CFG->libdir.'/filelib.php'); |
| 31 | + |
/**
 * Represents a document to index.
 *
 * Extends the core search document to turn the engine's highlight markers
 * into HTML and to export attached files (with their extracted text) for
 * indexing.
 *
 * @copyright 2017 Catalyst IT
 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class document extends \core_search\document {

    /**
     * @var \stdClass|null Plugin settings loaded with get_config('search_postgresfulltext').
     */
    private $config = null;

    /**
     * Constructor.
     *
     * Delegates to the parent constructor and loads the plugin settings.
     *
     * @param int $itemid
     * @param string $componentname
     * @param string $areaname
     */
    public function __construct($itemid, $componentname, $areaname) {
        parent::__construct($itemid, $componentname, $areaname);
        $this->config = get_config('search_postgresfulltext');
    }

    /**
     * Overridden to use HTML format, as highlighting is rendered with HTML span tags.
     *
     * @return int FORMAT_HTML
     */
    protected function get_text_format() {
        return FORMAT_HTML;
    }

    /**
     * Formats a text string coming from the search engine.
     *
     * The text is entity-encoded, then the engine's highlight markers are
     * replaced by span tags, and finally any unbalanced tags are closed or
     * opened so the resulting markup is always well formed.
     *
     * @param string $text Text to format
     * @return string HTML text to be rendered
     */
    protected function format_text($text) {
        // Since we allow output for highlighting, we need to encode html entities.
        // This ensures plaintext html chars don't become valid html.
        $out = s($text);

        $startcount = 0;
        $endcount = 0;

        // Remove end/start pairs that span a few common separation characters.
        // This allows us to highlight phrases instead of single words.
        // The markers are quoted so that any regex metacharacter (or the delimiter)
        // they might contain is matched literally.
        $regex = '|' . preg_quote(engine::HIGHLIGHT_END, '|') . '([ .,-]{0,3})' .
                preg_quote(engine::HIGHLIGHT_START, '|') . '|';
        $out = preg_replace($regex, '$1', $out);

        // Now replace our start and end highlight markers, counting each kind.
        $out = str_replace(engine::HIGHLIGHT_START, '<span class="highlight">', $out, $startcount);
        $out = str_replace(engine::HIGHLIGHT_END, '</span>', $out, $endcount);

        // Make sure the highlight tags are balanced, in case of truncation or if the
        // highlighted text itself contained our markers.
        if ($startcount > $endcount) {
            // More opening than closing tags: close the remainder at the end.
            $out .= str_repeat('</span>', $startcount - $endcount);
        } else if ($endcount > $startcount) {
            // More closing than opening tags: open the remainder at the beginning.
            $out = str_repeat('<span class="highlight">', $endcount - $startcount) . $out;
        }

        return parent::format_text($out);
    }

    /**
     * Export the data for the given file in relation to this document.
     *
     * @param \stored_file $file The stored file we are talking about.
     * @return array Keys: docid, fileid, filecontenthash, title, modified and text
     *               (text is false when no content could be extracted).
     */
    public function export_file_for_engine($file) {
        $data = array();
        // The file id is stored next to the id of the owning document.
        $data['docid'] = $this->data['id'];
        $data['fileid'] = $file->get_id();
        $data['filecontenthash'] = $file->get_contenthash();
        $data['title'] = $file->get_filename();
        $data['modified'] = $file->get_timemodified();
        $data['text'] = $this->extract_text_from_file($file);

        return $data;
    }

    /**
     * Extract text from a file using Apache Tika.
     *
     * The file is posted to the "/tika/form" endpoint below the configured Tika URL.
     *
     * @param \stored_file $storedfile
     * @return string|bool The response body, or false when file indexing is
     *                     disabled, no Tika URL is configured, or the request did
     *                     not return HTTP 200.
     */
    private function extract_text_from_file($storedfile) {
        // Both settings are checked with empty() so a missing setting does not raise a notice.
        if (empty($this->config->tikaurl) || empty($this->config->fileindexing)) {
            return false;
        }

        $curl = new \Curl();
        $url = $this->config->tikaurl."/tika/form";
        $text = $curl->post($url, array("upload" => $storedfile));

        if ($curl->info['http_code'] != 200) {
            return false;
        }

        return $text;
    }
}
0 commit comments