spliti/page.go

116 行
2.9 KiB
Go

package main
import (
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strings"
	"time"
)
// pageTitleRe captures the text between a <title> tag and the nearest
// following </title>. The capture is non-greedy so that a later </title> on
// the same line (for example from an inline SVG) is not swallowed. Because
// "." does not match a newline, both tags must be on the same line.
var pageTitleRe = regexp.MustCompile("<title>(.*?)</title>")

// gettitle returns the page title: the contents of the first
// <title>...</title> pair found in str, or "" when there is none.
func gettitle(str string) string {
	if m := pageTitleRe.FindStringSubmatch(str); len(m) > 1 {
		return m[1]
	}
	return ""
}
// newsPhotoRe captures the src attribute of the article photo <img> tag.
// The value is matched with [^"]* so the capture stops at the attribute's
// closing quote and cannot run on into later attributes or tags.
var newsPhotoRe = regexp.MustCompile(`<img class="NEWS_tempPhoto__picture" src="([^"]*)" alt="">`)

// getimg returns the URL of the article photo found in str, rewritten to go
// through the image proxy: every "https://" in the URL is replaced by
// cnf.imgproxy followed by "/". It returns "" when str contains no
// NEWS_tempPhoto__picture image tag of the expected form.
func getimg(str string, cnf Config) string {
	m := newsPhotoRe.FindStringSubmatch(str)
	if len(m) < 2 {
		return ""
	}
	return strings.ReplaceAll(m[1], "https://", cnf.imgproxy+"/")
}
// getdesc derives a description from page content: it deletes every
// `<div class="newsArticle">...</div>` span that opens and closes on a single
// line (the match is non-greedy and "." does not cross newlines), then hands
// what is left to strip_tags and returns its result.
func getdesc(str string) string {
	articleDiv := regexp.MustCompile(`<div class="newsArticle">(.*?)</div>`)
	return strip_tags(articleDiv.ReplaceAllString(str, ""))
}
// get fetches the page at "https://news.mixi.jp" + url and returns the pieces
// needed to render it.
//
// url is appended verbatim to the host. cnf is passed unchanged to getimg and
// to the rm*bloat cleaners.
//
// The returned map always holds the keys "title", "content", "img", "desc"
// and "err". It starts out as a "not found" placeholder page:
//   - if the request, the body read, or the EUCJPToUTF8 conversion fails, the
//     placeholder is returned with "err" set to a message (also printed);
//   - if the response status is not 200 OK, the placeholder is returned with
//     "err" left empty;
//   - otherwise "title", "content", "desc" (and "img" for article pages) are
//     filled from the response body.
func get(url string, cnf Config) map[string]string {
	// Default = error page.
	res := map[string]string{
		"title": "見つけられない",
		"content": `
<div class="newsArticle"><div class="articleHeading02">
<div class="headingArea">
<h1>見つけられなかった</h1>
</div>
</div>
<div class="contents clearfix">
<div class="article decoratable">
<p>ごめんね!</p>
</div>
</div>
`,
		"img":  "",
		"desc": "",
		"err":  "",
	}

	// http.Get uses http.DefaultClient, which has no timeout, so a stalled
	// upstream would block the caller forever. A Client with a nil Transport
	// still uses http.DefaultTransport, so connections are pooled as before.
	const fetchTimeout = 30 * time.Second
	client := &http.Client{Timeout: fetchTimeout}

	resp, err := client.Get("https://news.mixi.jp" + url)
	if err != nil {
		res["err"] = "URLエラー"
		fmt.Println(res["err"] + ": " + err.Error())
		return res
	}
	defer resp.Body.Close()

	// Anything other than 200 OK is served as the placeholder page.
	if resp.StatusCode != http.StatusOK {
		return res
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		res["err"] = "内容はバイトコードとして読み込みに失敗。"
		// Include the underlying cause, as the request-failure log does.
		fmt.Println(res["err"] + ": " + err.Error())
		return res
	}
	body, err := EUCJPToUTF8(raw)
	if err != nil {
		res["err"] = err.Error()
		fmt.Println(res["err"])
		return res
	}

	// NOTE(review): the error from getid is discarded; id is only consumed by
	// rmbloat on the article path below. Confirm that getid's value on failure
	// is acceptable there.
	id, _ := getid(url)
	res["title"] = gettitle(body)

	// Pick the cleaner by page type. In every branch, a body that lacks the
	// marker expected for that page type is routed to rmebloat instead
	// (presumably the error-page cleaner; verify against its definition).
	switch {
	case isarticle(url):
		if strings.Contains(body, "newsArticle") {
			res["img"] = getimg(body, cnf)
			res["content"] = rmbloat(id, body, cnf)
		} else {
			res["content"] = rmebloat(body, cnf)
		}
	case ispublish(url):
		res["content"] = rmpbloat(body, cnf)
	case issubcat(url):
		// The site answers unknown categories with this alert paragraph.
		if strings.Contains(body, `<p class="messageAlert">存在しないカテゴリです</p>`) {
			res["content"] = rmebloat(body, cnf)
		} else {
			res["content"] = rmsbloat(body, cnf)
		}
	case istubayaki(url):
		if strings.Contains(body, "quoteList") {
			res["content"] = rmqbloat(body, cnf)
		} else {
			res["content"] = rmebloat(body, cnf)
		}
	default:
		if strings.Contains(body, "注目のニュース") {
			res["content"] = rmcbloat(body, cnf)
		} else {
			res["content"] = rmebloat(body, cnf)
		}
	}

	// The description is derived from the cleaned content, not the raw body.
	res["desc"] = getdesc(res["content"])
	return res
}