v0.2.0 Idol data scraper and unrating.

- Add binary `juniority-lookup` that scrapes idol data from
  a junior idol information site.
- Add unrating function to make rated items unrated again.
- Enable (un)rating idols and volumes with no menus.
このコミットが含まれているのは:
Jay Eye 2023-04-01 12:12:22 -04:00
コミット 6f62dd2446
7個のファイルの変更513行の追加18行の削除

ファイルの表示

@ -45,6 +45,7 @@ BINDIR="$HOME/.local/bin"
pushd $BINDIR
ln -sr $JUNIORITY_DIR/bin/juniority.scm ./juniority
ln -sr $JUNIORITY_DIR/bin/juniority-import.scm ./juniority-import
ln -sr $JUNIORITY_DIR/bin/juniority-lookup.scm ./juniority-lookup
```
### Database
@ -63,3 +64,4 @@ The binaries display help when invoked without arguments:
- `juniority`
- `juniority-import`
- `juniority-lookup`

ファイルの表示

@ -33,15 +33,20 @@ coding: utf-8
(define (main args)
(match args
((bin "help" . other) (display-help bin))
((bin "-h" . other) (display-help bin))
((bin "--help" . other) (display-help bin))
((bin idols-file movies-file)
(let ((idols-hash (main-idols idols-file)))
(main-movies idols-hash movies-file)))
((bin . other)
(format #t "Usage: ~a path/to/idols.tsv path/to/movies.tsv
((bin . other) (display-help bin))
))
(define (display-help bin)
(format (current-error-port) "Usage: ~a path/to/idols.tsv path/to/movies.tsv
This utility is meant to be used in concert with `juniority-import`. E.g.:
~a path/to/idols.tsv path/to/movies.tsv | juniority-import path/to/juniority.db
" bin bin))))
" bin bin))
(define (main-idols idols-file)
(let ((idols-hash (make-hash-table))

ファイルの表示

@ -18,9 +18,14 @@ coding: utf-8
(define (main args)
(match args
((bin "help" . rest) (display-help bin))
((bin "-h" . rest) (display-help bin))
((bin "--help" . rest) (display-help bin))
((bin db-path) (main-import db-path))
((bin . _)
(format #t "Import:
((bin . _) (display-help bin))
))
(define (display-help bin)
(format #t "Import:
In order to import your existing records, it must be in the following format:
stmt = volume-stmt | idol-stmt | tag-stmt
volume-stmt = studio-[0-9]+ idol-assoc* tag-assoc* pub?
@ -63,7 +68,6 @@ Use import mode in the future to add records to your database. For manual entry,
...
EOF
" bin bin))
))
(define (main-import db-path)
(let ((db (open-db db-path))

434
bin/juniority-lookup.scm 実行可能ファイル
ファイルの表示

@ -0,0 +1,434 @@
#!/usr/bin/env -S guile -e main -s
coding: utf-8
!#
(use-modules
(ice-9 exceptions)
(ice-9 format)
(ice-9 match)
(ice-9 peg)
(ice-9 rdelim)
(ice-9 receive)
(ice-9 regex)
(ice-9 string-fun)
(juniority grammar)
(juniority util)
(srfi srfi-1)
(sxml fold)
(sxml simple)
(web client)
(web response)
(web uri)
)
(define-peg-pattern JYEAR none "年")
(define-peg-pattern JMONTH none "月")
(define-peg-pattern JDAY none "日")
(define-peg-pattern digit body (range #\0 #\9))
(define-peg-pattern year all (+ digit))
(define-peg-pattern month all (+ digit))
(define-peg-pattern day all (+ digit))
(define-peg-pattern japan-date all
(and year JYEAR month JMONTH day JDAY))
(define-peg-pattern japan-search-results all
(and (ignore "検索結果:")
(+ digit)
(ignore "件")))
(define (main args)
(match args
((bin "help" . rest) (display-help bin))
((bin "-h" . rest) (display-help bin))
((bin "--help" . rest) (display-help bin))
((bin "idol" idol) (lookup-idol idol))
((bin "volume" volume) (lookup-product volume))
((bin . rest) (display-help bin))
))
(define (display-help bin)
(format (current-error-port) "Usage: ~a idol <idol_name>
Usage: ~a volume <volume_search_string>
This tool can be used to retrieve an idol's videography,
the cast and publication date of a particular volume, or
information about a collection of volumes.
The output will be in Juniority import format, so you can
pipe it into `juniority-import' to populate your database.
-----------------------------------------------------------
------------- WARNING! NETWORK ACTIVITY -------------------
-----------------------------------------------------------
Because this application will contact a site that hosts
junior idol information, if this is not legal or acceptable
in your jurisdiction, we recommend you use `torsocks' or
connect to a VPN before running this application!
torsocks ~a <kind> <query>
-----------------------------------------------------------
-----------------------------------------------------------
-----------------------------------------------------------
" bin bin bin))
(define (lookup-idol idol-name)
(let* ((idol-search-result
(format #f
"https://u15dvdinfo.com/?s=~a&s_type=idols"
idol-name)))
(map (compose
export-idol-data
scrape-idol-data)
(scrape-idol-links (fetch-link idol-search-result)))))
(define (fetch-link url)
(let ((cached-link (hash-ref fetch-link-cache url)))
(if cached-link
cached-link
(begin
(prepare-request)
(receive (res res-body)
(http-request url)
(case (response-code res)
((200)
(hash-set! fetch-link-cache url res-body)
res-body)
(else #f)))))))
(define (scrape-idol-links markup)
(let* ((html (sanitize-markup markup))
(links-only (list-matches page-simple-link-re html))
(the-links (string-append
"<body>"
(string-concatenate (map match:substring links-only))
"</body>"))
(idol-urls (list)))
(foldt
(lambda (stuff)
(match stuff
(('a ('@ ('href url)) _)
(if (and (string? url)
(or (string-prefix? "https://u15dvdinfo.com/idols/" url)
(string-prefix? "/idols/" url)))
(set! idol-urls (cons url idol-urls))
(list 'href url)))
(other other)))
identity
(xml->sxml the-links))
idol-urls))
(define (scrape-idol-data idol-url)
(let* ((markup (fetch-link idol-url))
(html (sanitize-markup markup))
(idol-table
(let* ((start (string-contains-ci html "<table class=\"idol\""))
(end (string-contains-ci html "</table>" (or start 0)))
)
(if (and start end)
(substring html start (+ end 8))
"<table class=\"idol\"></table>")))
(products-table
(let* ((start
(string-contains-ci html "<table class=\"products\""))
(end (string-contains-ci html "</table>" (or start 0)))
)
(if (and start end)
(substring html start (+ end 8))
"<table class=\"products\"></table>")))
(the-tables (string-append "<body>"
idol-table
products-table
"</body>"))
(idol-en-name (extract-idol-en-name idol-url))
(idol-name #nil)
(idol-birth #nil)
(idol-volumes (list)))
(foldt
(lambda (stuff)
(match stuff
(('tr _ ('th ('@ _ ('class "idol_t")) "名前") _ ('td _ name))
(set! idol-name
(name-remove-kana name))
#t)
(('tr ('th ('@ ('class "idol")) "生年月日") ('td _ birth))
(set! idol-birth (peg:tree (match-pattern japan-date birth)))
#t)
(('tr ('@ ('class medium)) _ _ _ ('td pub) _ _ _ ('td ident) _ _ _ _ _)
(set! idol-volumes
(cons
(list 'volume
ident
(regexp-substitute/global #f "/" pub 'pre "-" 'post)
)
idol-volumes))
#t)
(other other)))
identity
(xml->sxml the-tables))
(list 'idol idol-name idol-en-name idol-birth idol-volumes)))
(define (extract-idol-en-name url)
(let* ((name-start (+ 1 (string-index-right url #\/)))
(name-portion (substring url name-start))
(sep-idx (or
(string-index name-portion #\_)
(string-index name-portion #\-)))
(initial1 (string-ref name-portion 0))
)
(string-set! name-portion 0 (char-upcase initial1))
(when sep-idx
(let ((initial2 (string-ref name-portion (+ 1 sep-idx))))
(string-set! name-portion sep-idx #\space)
(string-set! name-portion (+ 1 sep-idx) (char-upcase initial2))
))
name-portion))
(define (export-idol-data idol)
(match idol
(('idol #nil _ _ _)
(format (current-error-port)
"Warning: idol without name:\n\t~s\n" idol))
(('idol name en-name #nil volumes)
(format #t "@'~a' '~a'\n" name en-name)
(for-each
(export-volume-featuring name)
volumes))
(('idol name en-name ('japan-date ('year year) ('month month) ('day day)) volumes)
(format #t "@'~a' '~a' ~a-~a-~a\n" name en-name year month day)
(for-each
(export-volume-featuring name)
volumes))
(other
(format (current-error-port)
"Error: export-idol-data: unacceptable idol object:\n\t~s\n"
idol))))
(define (export-volume-featuring idol-name)
(lambda (volume)
(match volume
(('volume ident #nil)
(format #t "~a @'~a'\n" ident idol-name))
(('volume ident published)
(format #t "~a @'~a' ~a\n" ident idol-name published))
(other
(format (current-error-port)
"Error: export-volume-featuring: unacceptable volume object:\n\t~s\n" volume)))))
(define (lookup-product volume-ident)
(let* ((product-search-fmt
(format #f
"https://u15dvdinfo.com/page/~~a?s=~a&s_type=products"
volume-ident))
(product-links (list))
(fetched-idols-hash (make-hash-table))
(fetched-products-hash (make-hash-table))
)
(do ((page 1 (+ 1 page))
(page-html (fetch-link (format #f product-search-fmt 1)))
(stop? #f)
(search-result-text #nil)
(links #nil))
(stop?)
(if page-html
(begin
(set! search-result-text
(match:substring (regexp-exec search-results-re page-html)))
(match
(peg:tree (match-pattern japan-search-results search-result-text))
(('japan-search-results "0") (set! stop? #t))
(('japan-search-results num)
(set! links (scrape-product-links page-html))
(set! product-links (append links product-links))
(set! page-html
(fetch-link (format #f product-search-fmt (+ 1 page)))))))
(set! stop? #t))
(map (compose
export-product-data
(scrape-product-data fetched-products-hash fetched-idols-hash)
fetch-link)
product-links))))
(define (scrape-product-links markup)
(let* ((html (sanitize-markup markup))
(links-only (list-matches page-simple-link-re html))
(the-links (string-append
"<body>"
(string-concatenate (map match:substring links-only))
"</body>"))
(product-urls (list)))
(foldt
(lambda (stuff)
(match stuff
(('a ('@ ('href url)) _)
(if
(and (string? url)
(or (string-prefix? "https://u15dvdinfo.com/products/" url)
(string-prefix? "/products/" url)))
(set! product-urls (cons url product-urls))))
(other other)))
identity
(xml->sxml the-links))
product-urls))
(define (scrape-product-data fetched-products fetched-idols)
(lambda (markup)
(let* ((html (sanitize-markup markup))
(product-info-table
(let* ((start
(string-contains-ci html "<table class=\"pro_info\""))
(end (string-contains-ci html "</table>" (or start 0)))
)
(if (and start end)
(substring html start (+ end 8))
"<table class=\"pro_info\"></table>")))
(product-idols-table
(let* ((start
(string-contains-ci html "<table class=\"idol_info\""))
(end (string-contains-ci html "</table>" (or start 0)))
)
(if (and start end)
(substring html start (+ end 8))
"<table class=\"idol_info\"></table>")))
(the-tables (string-append "<body>"
product-info-table
product-idols-table
"</body>"))
(product-id #nil)
(product-published #nil)
(product-idols (list))
(product-related (list)))
(foldt
(lambda (stuff)
(match stuff
(('tr ('th "品番") ('td volume-id))
(set! product-id volume-id))
(('tr ('th "発売日") ('td volume-pub))
(set! product-published
(peg:tree (match-pattern japan-date volume-pub))))
(('td ('a ('@ ('href idol-rel-url)) idol-name))
(cond
((string-prefix? "/idol/" idol-rel-url)
(let ((abs-url
(format #f "https://u15dvdinfo.com~a"
(string-replace idol-rel-url "s" 5 5 0 1)))
)
(set! product-idols
(cons (list 'idol idol-name abs-url)
product-idols))))
((eq? #f (string-index idol-rel-url #\/))
(set! product-related
(cons
(list
'other
(format #f "https://u15dvdinfo.com~a"
(string-append "/products/" idol-rel-url)))
product-related)))
(else
(set! product-related
(cons
(list
'other
(format #f "https://u15dvdinfo.com~a"
idol-rel-url))
product-related)))))
(other other)))
identity
(xml->sxml the-tables))
(for-each
(match-lambda
(('idol name url)
(unless (hash-ref fetched-idols name)
(format #t "@'~a'\n" name)
(let* ((html (fetch-link url)))
(if html
((compose export-idol-data scrape-idol-data) url))
(hash-set! fetched-idols name url)))))
product-idols)
(if (hash-ref fetched-products product-id)
#f
(begin
(hash-set!
fetched-products
product-id
(list 'product product-id product-published product-idols))
(list 'product product-id product-published product-idols))
))))
(define (export-product-data product)
(match product
(('product #nil _ _)
(format (current-error-port)
"Warning: product without ident:\n\t~s\n" product))
(('product ident #nil idols)
(format #t "~a~a\n"
ident
(string-concatenate
(map (match-lambda
(('idol name _) (format #f " @'~a'" name)))
idols))
))
(('product ident ('japan-date ('year year) ('month month) ('day day)) idols)
(format #t "~a~a~a\n"
ident
(string-concatenate
(map (match-lambda
(('idol name _) (format #f " @'~a'" name)))
idols))
(format #f " ~a-~a-~a" year month day)))
(#f (noop))
(other
(format (current-error-port)
"Error: export-product-data: unacceptable product object:\n\t~s\n"
product))))
(define (prepare-request) (sleep 1))
(define (sanitize-markup markup)
(let* ((no-br (regexp-substitute/global
#f "<br[^>]*>"
markup
'pre 'post))
(no-hr (regexp-substitute/global
#f "<hr[^>]*>"
no-br
'pre 'post))
(no-img (regexp-substitute/global
#f "<img[^>]*>"
no-hr
'pre 'post))
(no-input (regexp-substitute/global
#f "<input[^>]*>"
no-img
'pre 'post))
(no-comment (regexp-substitute/global
#f "<!--[^>]*>"
no-input
'pre 'post))
(no-entity (regexp-substitute/global
#f "&[^;]*;"
no-comment
'pre 'post))
)
no-entity))
(define fetch-link-cache (make-hash-table))
(define page-simple-link-re
(make-regexp "<a[^>]*>[^<]*</a>"))
(define search-results-re
(make-regexp "検索結果:[0-9]+件"))
(define (name-remove-kana name)
(car (string-split (string-drop name 1) japan-paren-open)))
(define japan-paren-open (integer->char 65288))

ファイルの表示

@ -44,12 +44,17 @@ coding: utf-8
(define (main args)
(match args
((bin "help" . rest) (display-help bin))
((bin "-h" . rest) (display-help bin))
((bin "--help" . rest) (display-help bin))
((bin db-path) (main-tui db-path))
((bin . _) (format #t "Usage:
((bin . _) (display-help bin))))
(define (display-help bin)
(format #t "Usage:
Interactive mode: ~a path/to/db.sqlite3
For more details, wait for this help text to be improved.
" bin))))
" bin))
(define (main-tui db-path)
(let ((db (open-db db-path))
@ -126,7 +131,7 @@ For more details, wait for this help text to be improved.
(set-interactive-state-in-menu! state #f)
(goto-top-view! db state))
(other (render-key-press state other))))
(('idol #f _ _ _ _)
(('idol #f _ idol _ _)
(match ch
(#\esc
(if (history? state)
@ -138,6 +143,16 @@ For more details, wait for this help text to be improved.
(#\t
(history-delete! state)
(goto-top-view! db state))
(#\`
(param-query-one db rate-idol-stmt
#nil
(vector-ref idol 0))
(goto-idol-detail-view! db state (vector-ref idol 3)))
((? char?digit? rating)
(param-query-one db rate-idol-stmt
(digit->number rating)
(vector-ref idol 0))
(goto-idol-detail-view! db state (vector-ref idol 3)))
(other (render-key-press state other))
))
(('idol menu _ idol _ _)
@ -168,6 +183,13 @@ For more details, wait for this help text to be improved.
(set-interactive-state-in-menu! state #f)
(history-delete! state)
(goto-top-view! db state))
(#\`
(unpost-menu menu)
(set-interactive-state-in-menu! state #t)
(param-query-one db rate-idol-stmt
#nil
(vector-ref idol 0))
(goto-idol-detail-view! db state (vector-ref idol 3)))
((? char?digit? rating)
(unpost-menu menu)
(set-interactive-state-in-menu! state #t)
@ -204,6 +226,16 @@ For more details, wait for this help text to be improved.
(#\t
(history-delete! state)
(goto-top-view! db state))
(#\`
(param-query-one db rate-volume-stmt
#nil
(vector-ref volume 0))
(goto-volume-detail-view! db state (vector-ref volume 1)))
((? char?digit? rating)
(param-query-one db rate-volume-stmt
(digit->number rating)
(vector-ref volume 0))
(goto-volume-detail-view! db state (vector-ref volume 1)))
(other (render-key-press state other))))
(('volume menu _ volume _ _)
(update-menu! menu ch)
@ -232,6 +264,13 @@ For more details, wait for this help text to be improved.
(set-interactive-state-in-menu! state #f)
(history-delete! state)
(goto-top-view! db state))
(#\`
(unpost-menu menu)
(set-interactive-state-in-menu! state #t)
(param-query-one db rate-volume-stmt
#nil
(vector-ref volume 0))
(goto-volume-detail-view! db state (vector-ref volume 1)))
((? char?digit? rating)
(unpost-menu menu)
(set-interactive-state-in-menu! state #t)
@ -296,6 +335,13 @@ For more details, wait for this help text to be improved.
(set-interactive-state-in-menu! state #f)
(history-delete! state)
(goto-top-view! db state))
(#\`
(unpost-menu menu)
(set-interactive-state-in-menu! state #t)
(param-query-one db rate-studio-stmt
#nil
(vector-ref studio 0))
(goto-studio-detail-view! db state (vector-ref studio 1)))
((? char?digit? rating)
(unpost-menu menu)
(set-interactive-state-in-menu! state #t)
@ -436,6 +482,7 @@ For more details, wait for this help text to be improved.
(vector-ref result 0)
(idol-desc result)))
search-idols-stmt
search-string
search-string))
('volumes
(param-query-map
@ -445,6 +492,7 @@ For more details, wait for this help text to be improved.
(vector-ref result 0)
(volume-desc result)))
search-volumes-stmt
search-string
search-string))))
(result-menu (if (null? result-items)
#f
@ -1092,7 +1140,7 @@ For more details, wait for this help text to be improved.
" yju uni uni ri "
" Other tools: jun nio iori "
" - juniority-import uni ior rit "
" nio ori ity "
" - juniority-lookup nio ori ity "
" ior rit tyj "
" ori ity yju "
" ority tyj jun "

ファイルの表示

@ -185,18 +185,20 @@ WHERE id = ?")
(define search-idols-stmt
"SELECT idol.jp_name, idol.birth_date, idol.rating
FROM idol_tag
JOIN idol ON idol_tag.idol_id = idol.id
JOIN tag ON idol_tag.tag_id = tag.id
FROM idol
LEFT OUTER JOIN idol_tag ON idol_tag.idol_id = idol.id
LEFT OUTER JOIN tag ON idol_tag.tag_id = tag.id
WHERE tag.name LIKE '%' || ? || '%'
OR idol.jp_name LIKE '%' || ? || '%'
ORDER BY idol.rating DESC NULLS LAST, idol_tag.id DESC")
(define search-volumes-stmt
"SELECT volume.identifier, volume.published, volume.rating
FROM volume_tag
JOIN volume ON volume_tag.volume_id = volume.id
JOIN tag ON volume_tag.tag_id = tag.id
FROM volume
LEFT OUTER JOIN volume_tag ON volume_tag.volume_id = volume.id
LEFT OUTER JOIN tag ON volume_tag.tag_id = tag.id
WHERE tag.name LIKE '%' || ? || '%'
OR volume.identifier LIKE '%' || ? || '%'
ORDER BY volume.rating DESC NULLS LAST, volume_tag.id DESC")
(define select-all-idols-stmt
@ -204,7 +206,7 @@ ORDER BY volume.rating DESC NULLS LAST, volume_tag.id DESC")
ORDER BY rating DESC NULLS LAST, birth_date DESC NULLS LAST")
(define select-all-studios-stmt
"SELECT call_sign, full_name, rating FROM studio
ORDER BY call_sign ASC")
ORDER BY rating DESC NULLS LAST, call_sign ASC")
(define select-all-tags-stmt
"SELECT name FROM tag
ORDER BY name ASC")

ファイルの表示

@ -9,7 +9,7 @@
(define-syntax log
(syntax-rules ()
((log fmt expr expr1 ...)
(format #t (format #f "~s\n" fmt) expr expr1 ...))
(format #t (format #f "~a\n" fmt) expr expr1 ...))
((log expr) (format #t "~a\n" expr))
))