2022-06-01 05:29:23 +09:00
|
|
|
<?php
|
2022-06-01 22:23:51 +09:00
|
|
|
require_once('../helper.php');
|
|
|
|
require_once('../config.php');
|
|
|
|
|
|
|
|
// Accumulators shared by the rest of the script:
//   $urls maps a website id to its URL (filled from the `website` table),
//   $res  collects one entry per crawled page, ready to be written back.
$urls = [];
$res = [];

// Make mysqli throw mysqli_sql_exception on failure instead of returning
// false or emitting warnings.
mysqli_report(MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT);

// With strict reporting enabled a failed connection throws, so inspecting
// mysqli_connect_errno() afterwards can never report anything. Catch the
// exception instead so the "Connect failed" message is actually printed.
// DBHOST/DBUSER/DBPASS/DBNAME are presumably defined by ../config.php.
try {
    $mysqli = mysqli_connect(DBHOST, DBUSER, DBPASS, DBNAME);
} catch (mysqli_sql_exception $e) {
    printf("Connect failed: %s\n", $e->getMessage());
    exit(1); // non-zero status so a calling scheduler can detect the failure
}
|
|
|
|
|
|
|
|
// Load every registered website into $urls as id => URL.
// NOTE(review): "SELECT *" combined with exactly two bound result variables
// only works while the `website` table has exactly two columns in this
// order — consider naming the columns explicitly once the schema is confirmed.
if ($sites = mysqli_prepare($mysqli, "SELECT * FROM website")) {
    // Execute once; the statement used to be executed a second time after
    // binding, which ran the same query twice for no benefit.
    mysqli_stmt_execute($sites);
    mysqli_stmt_bind_result($sites, $wid, $wurl);

    while (mysqli_stmt_fetch($sites)) {
        $urls[$wid] = $wurl;
    }

    mysqli_stmt_close($sites);
}
|
|
|
|
|
|
|
|
// Crawl every registered website: collect the internal links found on its
// page, keep those that serve an HTML document, and queue the title and
// plain-text body of each one in $res.
foreach ($urls as $k => $v) {
    // Download the page and extract every href that is root-relative or an
    // absolute http(s) URL. escapeshellarg() stops shell metacharacters in
    // the URL from being interpreted by the shell.
    $out = shell_exec('curl -s -f -L '.escapeshellarg($v).' | grep -Eo "href=\"(/\S+?|https?://\S+?)\"" 2>&1');
    // shell_exec() yields null/false when there is no output; treat as empty.
    $out = explode("\n", (string) $out);

    // Normalise each match into a bare URL and keep only internal links.
    $links = [];
    foreach ($out as $o) {
        // Strip the HTML attribute syntax around the URL.
        $tmp = str_replace('href="', '', $o);
        $tmp = str_replace('"', '', $tmp);

        // Root-relative links get the website URL prepended.
        if (str_starts_with($tmp, '/')) $tmp = $v.$tmp;
        // Drop one trailing slash so "/a" and "/a/" count as the same page.
        if (str_ends_with($tmp, '/')) $tmp = substr($tmp, 0, -1);

        if ($tmp === '') continue;                 // empty line
        if (!str_starts_with($tmp, $v)) continue;  // external link

        $links[] = $tmp;
    }

    // Remove duplicate URLs.
    $links = array_unique($links);

    foreach ($links as $o) {
        // URLs longer than 100 bytes are never stored, so skip them before
        // spending any network requests on them.
        if (strlen($o) > 100) continue;

        // Fetch the page once and reuse the response for both the HTML check
        // and the title. NOTE(review): assumes execcurl() returns the response
        // body as a string — confirm against ../helper.php.
        $curl = (string) execcurl($o, 'cout', false, false, false);

        // Only HTML documents are indexed. The doctype is matched
        // case-insensitively because "<!doctype html>" is equally valid.
        if (stripos($curl, '<!DOCTYPE html') === false) continue;

        // Title: the text between <title> and </title> in the fetched page.
        // (This used to search the empty $ptitle instead of the response, so
        // the title always came out blank.)
        $ptitle = '';
        $fromTitle = mb_stristr($curl, '<title>');
        if ($fromTitle !== false) {
            $upToClose = mb_stristr($fromTitle, '</title>', true);
            if ($upToClose !== false) {
                // $upToClose starts with the matched opening tag, whatever its
                // letter case; cut it off by length, then trim whitespace.
                $ptitle = trim(mb_substr($upToClose, mb_strlen('<title>')));
            }
        }

        // Titles longer than 100 characters once entity-encoded are skipped;
        // check before running the comparatively expensive pandoc conversion.
        if (mb_strlen(htmlentities($ptitle)) > 100) continue;

        // Body: the page rendered as plain text by pandoc.
        $pbody = shell_exec('curl -s -L '.escapeshellarg($o).' | pandoc -f html -t plain 2>&1');

        $res[] = [
            'website_id' => $k,
            'url' => $o,
            'title' => htmlentities($ptitle),
            'body' => htmlentities((string) $pbody)
        ];
    }
}
|
|
|
|
|
|
|
|
// Write every crawled page to `website_page`: update the row that already
// has the same URL, otherwise insert a new one.
//
// All values are sent as bound parameters. They originate from crawled pages,
// so concatenating them into the SQL text would allow SQL injection and would
// break on any quote or backslash in a URL, title or body.
// The statements are prepared once and re-executed for every page.
$find = mysqli_prepare($mysqli, "SELECT id FROM website_page WHERE url = ?");
$insert = mysqli_prepare($mysqli, "INSERT INTO `website_page` (website_id, url, title, body) VALUES (?, ?, ?, ?)");
$update = mysqli_prepare($mysqli, "UPDATE `website_page` SET title = ?, body = ? WHERE id = ?");

// Parameters are bound by reference: assigning to these variables inside the
// loop is what changes the values sent on the next execute.
$rurl = '';
$rwebsite = 0;
$rtitle = '';
$rbody = '';
$rpid = null;
mysqli_stmt_bind_param($find, "s", $rurl);
mysqli_stmt_bind_param($insert, "isss", $rwebsite, $rurl, $rtitle, $rbody);
mysqli_stmt_bind_param($update, "ssi", $rtitle, $rbody, $rpid);

foreach ($res as $v) {
    $rurl = $v['url'];
    $rwebsite = $v['website_id'];
    $rtitle = $v['title'];
    $rbody = $v['body'];

    // Look up an existing row for this URL (executed once per page).
    mysqli_stmt_execute($find);
    // Buffer the result so the connection is free for the write below.
    mysqli_stmt_store_result($find);
    mysqli_stmt_bind_result($find, $wpid);
    $rpid = mysqli_stmt_fetch($find) === true ? $wpid : null;
    mysqli_stmt_free_result($find);

    if (is_null($rpid)) {
        mysqli_stmt_execute($insert);
    }
    else {
        mysqli_stmt_execute($update);
    }
}

mysqli_stmt_close($find);
mysqli_stmt_close($insert);
mysqli_stmt_close($update);

mysqli_close($mysqli);
|
2022-06-01 05:29:23 +09:00
|
|
|
?>
|