From 175041864649c6379505f9c8271328262d8cf49e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AB=8F=E8=A8=AA=E5=AD=90?= Date: Mon, 13 Nov 2023 16:52:46 +0900 Subject: [PATCH] =?UTF-8?q?=E3=83=95=E3=82=A1=E3=82=A4=E3=83=AB=E3=82=92?= =?UTF-8?q?=E5=88=86=E3=81=91=E3=81=9F=E6=96=B9=E3=81=8C=E8=89=AF=E3=81=84?= =?UTF-8?q?=E3=81=8B=E3=81=97=E3=82=89=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check.go | 34 ++++++++ lib.go | 44 ++++++++++ page.go | 232 ----------------------------------------------------- rmbloat.go | 170 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 248 insertions(+), 232 deletions(-) create mode 100644 check.go create mode 100644 lib.go create mode 100644 rmbloat.go diff --git a/check.go b/check.go new file mode 100644 index 0000000..70e4477 --- /dev/null +++ b/check.go @@ -0,0 +1,34 @@ +package main + +import ( + "strings" +) + +/* 記事かの確認 */ +func isarticle(url string) bool { + chk := strings.Split(url, "=") + return len(chk) > 2 && + (chk[0] == "/view_news.pl?id" || chk[0] == "/view_news.pl?from" || chk[0] == "/view_news.pl?media_id" || chk[0] == "/view_news.pl?stkt") +} + +/* 部分圏かの確認 */ +func issubcat(url string) bool { + chk := strings.Split(url, "=") + return len(chk) > 1 && + (chk[0] == "/list_news_category.pl?id" || chk[0] == "/list_news_category.pl?page" || chk[0] == "/list_news_category.pl?sort" || chk[0] == "/list_news_category.pl?type" || chk[0] == "/list_news_category.pl?sub_category_id") && + strings.Contains(url, "type=bn") +} + +/* 部分かの確認 */ +func iscategory(url string) bool { + chk := strings.Split(url, "=") + return len(chk) > 1 && + (chk[0] == "/list_news_category.pl?id" || chk[0] == "/list_news_category.pl?sub_category_id" || chk[0] == "/list_news_category?from") && + !strings.Contains(url, "type=bn") +} + +/* 出版社かの確認 */ +func ispublish(url string) bool { + chk := strings.Split(url, "=") + return len(chk) > 1 && (chk[0] == "/list_news_media.pl?id" || chk[0] == "/list_news_media.pl?page") +} diff --git a/lib.go b/lib.go new file mode 100644 index 0000000..f2701dd --- /dev/null +++ b/lib.go @@ -0,0 +1,44 @@ +package main + +import ( + "io/ioutil" + "strings" + "bytes" + + "golang.org/x/net/html" + "golang.org/x/text/encoding/japanese" + "golang.org/x/text/transform" +) + +/* PHPであるstrip_tagsはGo言語で存在しないから、自分で作る */ +func strip_tags(data string) string { + doc, err := html.Parse(strings.NewReader(data)) + if err != nil { + panic("HTMLをパーシングに失敗。") + } + + var buf bytes.Buffer + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.TextNode { + buf.WriteString(n.Data) + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + f(doc) + + return buf.String() +} + +func EUCJPToUTF8(input []byte) (string, error) { + transformer := japanese.EUCJP.NewDecoder() + reader := transform.NewReader(bytes.NewReader(input), transformer) + result, err := ioutil.ReadAll(reader) + if err != nil { + return "エンコーディングに失敗", err + } + + return string(result), nil +} diff --git a/page.go b/page.go index d765002..27dbb12 100644 --- a/page.go +++ b/page.go @@ -6,46 +6,8 @@ import ( "net/http" "regexp" "strings" - "bytes" - - "golang.org/x/net/html" - "golang.org/x/text/encoding/japanese" - "golang.org/x/text/transform" ) -/* PHPであるstrip_tagsはGo言語で存在しないから、自分で作る */ -func strip_tags(data string) string { - doc, err := html.Parse(strings.NewReader(data)) - if err != nil { - panic("HTMLをパーシングに失敗。") - } - - var buf bytes.Buffer - var f func(*html.Node) - f = func(n *html.Node) { - if n.Type == html.TextNode { - buf.WriteString(n.Data) - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c) - } - } - f(doc) - - return buf.String() -} - -func EUCJPToUTF8(input []byte) (string, error) { - transformer := japanese.EUCJP.NewDecoder() - reader := transform.NewReader(bytes.NewReader(input), transformer) - result, err := ioutil.ReadAll(reader) - if err != nil { - return "エンコーディングに失敗", err - } - - return string(result), nil -} - /* ページのタイトル */ func gettitle(str string) string { re := regexp.MustCompile("(.*)") @@ -71,200 +33,6 @@ func getdesc(str string) string { return strip_tags(res) } -/* 記事かの確認 */ -func isarticle(url string) bool { - chk := strings.Split(url, "=") - return len(chk) > 2 && - (chk[0] == "/view_news.pl?id" || chk[0] == "/view_news.pl?from" || chk[0] == "/view_news.pl?media_id" || chk[0] == "/view_news.pl?stkt") -} - -/* 部分圏かの確認 */ -func issubcat(url string) bool { - chk := strings.Split(url, "=") - return len(chk) > 1 && - (chk[0] == "/list_news_category.pl?id" || chk[0] == "/list_news_category.pl?page" || chk[0] == "/list_news_category.pl?sort" || chk[0] == "/list_news_category.pl?type" || chk[0] == "/list_news_category.pl?sub_category_id") && - strings.Contains(url, "type=bn") -} - -/* 部分かの確認 */ -func iscategory(url string) bool { - chk := strings.Split(url, "=") - return len(chk) > 1 && - (chk[0] == "/list_news_category.pl?id" || chk[0] == "/list_news_category.pl?sub_category_id" || chk[0] == "/list_news_category?from") && - !strings.Contains(url, "type=bn") -} - -/* 出版社かの確認 */ -func ispublish(url string) bool { - chk := strings.Split(url, "=") - return len(chk) > 1 && (chk[0] == "/list_news_media.pl?id" || chk[0] == "/list_news_media.pl?page") -} - -/* カテゴリーだけが残るまで消す */ -func rmcbloat(body string, cnf Config) string { - var re *regexp.Regexp - - rep := []struct { - pat string - repl string - }{ - {`(?s).*?`, ""}, - {`(?s).*?`, ""}, - {`(?s)
.*?
`, ""}, - {`(?s)