自動ウエブ内容の更新
このコミットが含まれているのは:
コミット
4cf5bbfcba
|
@ -0,0 +1,18 @@
|
|||
# /etc/crontab: system-wide crontab
# Unlike any other crontab you don't have to run the `crontab'
# command to install the new version when you edit this file
# and files in /etc/cron.d. These files also have username fields,
# that none of the other crontabs do.

SHELL=/bin/sh
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

# Example of job definition:
# .---------------- minute (0 - 59)
# |  .------------- hour (0 - 23)
# |  |  .---------- day of month (1 - 31)
# |  |  |  .------- month (1 - 12) OR jan,feb,mar,apr ...
# |  |  |  |  .---- day of week (0 - 6) (Sunday=0 or 7) OR sun,mon,tue,wed,thu,fri,sat
# |  |  |  |  |
# *  *  *  *  * user-name command to be executed

# Re-crawl all registered websites every Sunday at 02:00.
# The job changes into the tool directory first because crawler.php loads
# ../helper.php and ../config.php relative to the working directory.
# NOTE(review): the crawler fetches and processes untrusted web content;
# consider running it as a dedicated unprivileged user instead of root.
0 2 * * 0 root cd /path/to/the/kensaku.online/tool && php crawler.php
|
|
@ -1,2 +1,99 @@
|
|||
<?php
// Site crawler, intended to be run from cron.
// This section loads the project helpers/configuration and opens the
// database connection used by the rest of the script.

// Resolve includes against this file's directory: a path starting with "../"
// is looked up relative to the current working directory, which is not the
// script directory when the job is started by cron.
require_once __DIR__ . '/../helper.php';
require_once __DIR__ . '/../config.php';

$urls = []; // website id => website base URL
$res = [];  // crawled pages waiting to be written to the database

// Make mysqli throw mysqli_sql_exception on errors instead of returning false.
mysqli_report(MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT);

// DBHOST/DBUSER/DBPASS/DBNAME are presumably defined in config.php.
try {
    $mysqli = mysqli_connect(DBHOST, DBUSER, DBPASS, DBNAME);
} catch (mysqli_sql_exception $e) {
    // In strict reporting mode a failed connection throws, so the failure is
    // handled here; report it on stderr and exit with a non-zero status.
    fwrite(STDERR, sprintf("Connect failed: %s\n", $e->getMessage()));
    exit(1);
}
|
||||
|
||||
// Load every registered website into $urls as [website id => base URL].
// NOTE(review): "SELECT *" only works while the `website` table has exactly
// two columns in (id, url) order; naming the columns explicitly would be
// safer once the schema is confirmed.
if ($sites = mysqli_prepare($mysqli, "SELECT * FROM website")) {
    // Execute exactly once; the original ran the same statement a second time
    // after binding the result, which only repeated the query.
    mysqli_stmt_execute($sites);
    mysqli_stmt_bind_result($sites, $wid, $wurl);

    while (mysqli_stmt_fetch($sites)) {
        $urls[$wid] = $wurl;
    }

    mysqli_stmt_close($sites);
}
|
||||
|
||||
/**
 * Downloads a URL with curl (following redirects) and returns the response body.
 *
 * The URL is passed through escapeshellarg() because it may come from scraped,
 * untrusted HTML. "-f" makes HTTP error responses count as failures.
 * NOTE(review): escapeshellarg() is locale-sensitive; confirm LC_CTYPE is a
 * UTF-8 locale if URLs may contain raw non-ASCII characters.
 *
 * @return string|null The body, or null when the request failed or was empty.
 */
function crawler_fetch(string $url): ?string
{
    $body = shell_exec('curl -s -f -L -- ' . escapeshellarg($url));

    return is_string($body) && $body !== '' ? $body : null;
}

/**
 * Collects the unique internal links found in href="..." attributes.
 *
 * Root-relative links are prefixed with the site URL, protocol-relative links
 * ("//host/...") get the site's scheme, fragments and trailing slashes are
 * removed, and anything that does not live under the site URL is dropped.
 *
 * @return list<string> Absolute URLs belonging to the site.
 */
function crawler_extract_links(string $html, string $siteUrl): array
{
    $base = rtrim($siteUrl, '/');
    if ($base === '' || !preg_match_all('~href="(/[^"\s]*|https?://[^"\s]+)"~i', $html, $matches)) {
        return [];
    }

    $scheme = parse_url($base, PHP_URL_SCHEME) ?: 'http';
    $links = [];

    foreach ($matches[1] as $href) {
        // Attribute values are HTML-encoded (e.g. "&amp;" inside query strings).
        $link = html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // A fragment never identifies a different document.
        $link = explode('#', $link, 2)[0];

        if (str_starts_with($link, '//')) {
            $link = $scheme . ':' . $link;
        } elseif (str_starts_with($link, '/')) {
            $link = $base . $link;
        }

        $link = rtrim($link, '/');

        if ($link === $base) {
            $links[] = $link;
            continue;
        }
        if (!str_starts_with($link, $base)) {
            continue; // external link
        }

        // Require a path/query boundary right after the base so that
        // "https://example.com.evil.test" is not mistaken for "https://example.com".
        $next = $link[strlen($base)];
        if ($next === '/' || $next === '?') {
            $links[] = $link;
        }
    }

    return array_values(array_unique($links));
}

/**
 * Returns the decoded, trimmed text of the first <title> element, or '' if none.
 */
function crawler_extract_title(string $html): string
{
    if (!preg_match('~<title\b[^>]*>(.*?)</title\s*>~is', $html, $match)) {
        return '';
    }

    return trim(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
}

/**
 * Converts an HTML document to plain text with pandoc.
 *
 * The markup is handed over through a temporary file so the page does not have
 * to be downloaded again; pandoc's diagnostics are discarded instead of being
 * mixed into the returned text.
 *
 * @return string The plain-text rendering, or '' when the conversion failed.
 */
function crawler_html_to_text(string $html): string
{
    $tmp = tempnam(sys_get_temp_dir(), 'crawl');
    if ($tmp === false) {
        return '';
    }

    try {
        file_put_contents($tmp, $html);
        $text = shell_exec('pandoc -f html -t plain ' . escapeshellarg($tmp) . ' 2>/dev/null');
    } finally {
        unlink($tmp);
    }

    return is_string($text) ? $text : '';
}

// Crawl every site: read its start page, follow each internal link once and
// queue title/body of every HTML page in $res for the database step.
foreach ($urls ?? [] as $siteId => $siteUrl) {
    $startPage = crawler_fetch((string) $siteUrl);
    if ($startPage === null) {
        continue; // site unreachable; try the next one
    }

    foreach (crawler_extract_links($startPage, (string) $siteUrl) as $pageUrl) {
        // Each page is downloaded once and reused for all checks below.
        $html = crawler_fetch($pageUrl);

        // Only HTML documents are indexed; the doctype may be written in any case.
        if ($html === null || stripos($html, '<!DOCTYPE html') === false) {
            continue;
        }

        $res[] = [
            'website_id' => $siteId,
            'url' => $pageUrl,
            // Stored entity-encoded, as before; the title is decoded first so
            // existing entities are not encoded a second time.
            'title' => htmlentities(crawler_extract_title($html), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'),
            'body' => htmlentities(crawler_html_to_text($html), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'),
        ];
    }
}
|
||||
|
||||
// Write the crawled pages to `website_page`: update the row when the URL is
// already known, insert a new row otherwise.
// All values travel as bound parameters; page titles and bodies come from
// crawled (untrusted) content and must never be concatenated into SQL.
$findPage = mysqli_prepare($mysqli, 'SELECT id FROM website_page WHERE url = ?');
$insertPage = mysqli_prepare(
    $mysqli,
    'INSERT INTO `website_page` (website_id, url, title, body) VALUES (?, ?, ?, ?)'
);
$updatePage = mysqli_prepare($mysqli, 'UPDATE `website_page` SET title = ?, body = ? WHERE id = ?');

// Parameters are bound by reference once; assigning to these variables inside
// the loop changes the values sent by the next execute.
$rowWebsiteId = 0;
$rowUrl = '';
$rowTitle = '';
$rowBody = '';
$rowPageId = 0;
mysqli_stmt_bind_param($findPage, 's', $rowUrl);
mysqli_stmt_bind_param($insertPage, 'isss', $rowWebsiteId, $rowUrl, $rowTitle, $rowBody);
mysqli_stmt_bind_param($updatePage, 'ssi', $rowTitle, $rowBody, $rowPageId);

foreach ($res as $page) {
    $rowWebsiteId = (int) $page['website_id'];
    $rowUrl = (string) $page['url'];
    $rowTitle = (string) $page['title'];
    $rowBody = (string) $page['body'];

    // Look up an existing row for this URL (the statement is executed once).
    mysqli_stmt_execute($findPage);
    mysqli_stmt_bind_result($findPage, $existingId);
    $exists = mysqli_stmt_fetch($findPage) === true;
    // Release the result set so the connection can run the next statement.
    mysqli_stmt_free_result($findPage);

    if ($exists) {
        $rowPageId = (int) $existingId;
        mysqli_stmt_execute($updatePage);
    } else {
        mysqli_stmt_execute($insertPage);
    }
}

mysqli_stmt_close($findPage);
mysqli_stmt_close($insertPage);
mysqli_stmt_close($updatePage);

mysqli_close($mysqli);
|
||||
?>
|
||||
|
|
新しいイシューから参照