add basic metadata scraping for repos

このコミットが含まれているのは:
Arya Kiran 2023-02-14 20:24:33 +05:30
コミット 8aef883056
この署名に対応する既知のキーがデータベースに存在しません
GPGキーID: 842D12BDA50DF120
3個のファイルの変更、68行の追加、43行の削除

ファイルの表示

@@ -1,27 +1,28 @@
package pages package pages
import ( import (
"context"
"log"
"codeberg.org/gothub/gothub/utils" "codeberg.org/gothub/gothub/utils"
"context"
"github.com/carlmjohnson/requests" "github.com/carlmjohnson/requests"
"github.com/gocolly/colly"
"github.com/gofiber/fiber/v2" "github.com/gofiber/fiber/v2"
"github.com/gomarkdown/markdown" "github.com/gomarkdown/markdown"
"log"
"net/http"
"os"
) )
type Repo struct { type Repo struct {
Fullname string Fullname string
Description string Description string
HtmlUrl string
Fork bool
Parent string Parent string
Stars int64 Stars string
Forks int64 Forks string
Watchers int64 Watchers string
Language string Language string
License string License string
DefaultBranch string DefaultBranch string
Readme string
} }
type RepoFiles struct { type RepoFiles struct {
@@ -35,14 +36,20 @@ type RepoFiles struct {
func HandleRepo(c *fiber.Ctx) error { func HandleRepo(c *fiber.Ctx) error {
var repoArray []Repo var repoArray []Repo
var repoFilesArray []RepoFiles var repoFilesArray []RepoFiles
// get repo
repo := utils.GetRequest("https://api.github.com/repos/" + c.Params("user") + "/" + c.Params("repo")) resp, statusErr := http.Get("https://github.com/" + c.Params("user") + "/" + c.Params("repo"))
if repo.Get("message").String() == "Not Found" { if statusErr != nil {
log.Println(statusErr)
}
if resp.StatusCode == 404 {
// I need a better way to do this
return c.Status(404).Render("error", fiber.Map{ return c.Status(404).Render("error", fiber.Map{
"title": "Error", "title": "Error",
"error": "Repository " + c.Params("user") + "/" + c.Params("repo") + " not found", "error": "Repository " + c.Params("user") + "/" + c.Params("repo") + " not found",
}) })
} }
// API
repo := utils.GetRequest("https://api.github.com/repos/" + c.Params("user") + "/" + c.Params("repo"))
repoFiles := utils.GetRequest("https://api.github.com/repos/" + c.Params("user") + "/" + c.Params("repo") + "/contents") repoFiles := utils.GetRequest("https://api.github.com/repos/" + c.Params("user") + "/" + c.Params("repo") + "/contents")
bruh := repoFiles.Get("#.@pretty").Array() bruh := repoFiles.Get("#.@pretty").Array()
for _, item := range bruh { for _, item := range bruh {
@@ -55,36 +62,54 @@ func HandleRepo(c *fiber.Ctx) error {
}) })
} }
var readmee string // Scraping
Scrape := Repo{
Language: repo.Get("language").String(),
}
UserAgent, ok := os.LookupEnv("GOTHUB_USER_AGENT")
if !ok {
UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}
sc := colly.NewCollector(colly.AllowedDomains("github.com"), colly.UserAgent(UserAgent))
sc.OnHTML("div.Layout-sidebar", func(e *colly.HTMLElement) {
Scrape.Fullname = c.Params("user") + "/" + c.Params("repo")
Scrape.Description = e.ChildText("p.f4")
Scrape.Stars = e.ChildText("a[href*='/" + c.Params("user") + "/" + c.Params("repo") + "/stargazers' i] strong")
Scrape.Watchers = e.ChildText("a[href*='/" + c.Params("user") + "/" + c.Params("repo") + "/watchers' i] strong")
Scrape.Forks = e.ChildText("a[href*='/" + c.Params("user") + "/" + c.Params("repo") + "/network/members' i] strong")
Scrape.License = e.ChildText("a[data-analytics-event*='{\"category\":\"Repository Overview\",\"action\":\"click\",\"label\":\"location:sidebar;file:license\"}']")
})
sc.OnHTML("div#readme", func(e *colly.HTMLElement) {
Scrape.Readme = e.ChildText("a[href*='#readme']")
})
sc.OnHTML("div#repository-container-header", func(e *colly.HTMLElement) {
Scrape.Parent = e.ChildText("span.text-small a")
})
sc.OnHTML("summary[title*='Switch branches or tags']", func(e *colly.HTMLElement) {
Scrape.DefaultBranch = e.ChildText("span.css-truncate-target")
})
sc.Visit("https://github.com/" + c.Params("user") + "/" + c.Params("repo") + "/")
// Add scrape-based info to repoArray
repoArray = append(repoArray, Scrape)
// README
var readmee string
err := requests. err := requests.
URL("https://raw.githubusercontent.com/" + c.Params("user") + "/" + c.Params("repo") + "/" + repo.Get("default_branch").String() + "/README.md"). URL("https://raw.githubusercontent.com/" + c.Params("user") + "/" + c.Params("repo") + "/" + Scrape.DefaultBranch + "/" + Scrape.Readme).
ToString(&readmee). ToString(&readmee).
Fetch(context.Background()) Fetch(context.Background())
if err != nil { if err != nil {
readmee = "" readmee = ""
log.Println(err) log.Println(err)
} }
mightBeUnsafe := markdown.ToHTML([]byte(readmee), nil, nil) mightBeUnsafe := markdown.ToHTML([]byte(readmee), nil, nil)
// Trust Nobody // Trust Nobody
readmeOutput := utils.UGCPolicy().SanitizeBytes(mightBeUnsafe) readmeOutput := utils.UGCPolicy().SanitizeBytes(mightBeUnsafe)
repoArray = append(repoArray, Repo{
Fullname: repo.Get("full_name").String(),
Description: repo.Get("description").String(),
HtmlUrl: repo.Get("html_url").String(),
Fork: repo.Get("fork").Bool(),
Stars: repo.Get("stargazers_count").Int(),
Forks: repo.Get("forks_count").Int(),
Watchers: repo.Get("watchers_count").Int(),
Language: repo.Get("language").String(),
License: repo.Get("license").Get("name").String(),
Parent: repo.Get("parent").Get("full_name").String(),
DefaultBranch: repo.Get("default_branch").String(),
})
return c.Render("repo", fiber.Map{ return c.Render("repo", fiber.Map{
"title": "Repository " + c.Params("user") + "/" + c.Params("repo"), "title": "Repository " + c.Params("user") + "/" + c.Params("repo"),
"repo": repoArray, "repo": repoArray,

ファイルの表示

@@ -5,7 +5,6 @@ import (
"log" "log"
"net/http" "net/http"
"os" "os"
"strconv"
"strings" "strings"
"codeberg.org/gothub/gothub/utils" "codeberg.org/gothub/gothub/utils"
@@ -26,8 +25,8 @@ type User struct {
Location string Location string
Email string Email string
Timezone string Timezone string
Following int64 Following string
Followers int64 Followers string
Link string Link string
Social []string Social []string
Organizations []string Organizations []string
@@ -108,8 +107,8 @@ func HandleUser(c *fiber.Ctx) error {
Scrape.Social = append(Scrape.Social, el.ChildText("a.Link--primary")) Scrape.Social = append(Scrape.Social, el.ChildText("a.Link--primary"))
}) })
// Followers/Following // Followers/Following
Scrape.Followers, err = strconv.ParseInt(e.ChildText("a[href*='https://github.com/"+c.Params("user")+"?tab=followers' i] span"), 10, 64) Scrape.Followers = e.ChildText("a[href*='https://github.com/" + c.Params("user") + "?tab=followers' i] span")
Scrape.Following, err = strconv.ParseInt(e.ChildText("a[href*='https://github.com/"+c.Params("user")+"?tab=following' i] span"), 10, 64) Scrape.Following = e.ChildText("a[href*='https://github.com/" + c.Params("user") + "?tab=following' i] span")
// Organizations // Organizations
e.ForEach("a[data-hovercard-type*='organization']", func(i int, el *colly.HTMLElement) { e.ForEach("a[data-hovercard-type*='organization']", func(i int, el *colly.HTMLElement) {
Scrape.Organizations = append(Scrape.Organizations, el.Attr("aria-label")) Scrape.Organizations = append(Scrape.Organizations, el.Attr("aria-label"))
@@ -135,7 +134,7 @@ func HandleUser(c *fiber.Ctx) error {
Scrape.Social = append(Scrape.Social, el.Attr("href")) Scrape.Social = append(Scrape.Social, el.Attr("href"))
}) })
// Followers // Followers
Scrape.Followers, err = strconv.ParseInt(e.ChildText("a[href*='/orgs/"+c.Params("user")+"/followers' i] span"), 10, 64) Scrape.Followers = e.ChildText("a[href*='/orgs/" + c.Params("user") + "/followers' i] span")
}) })
sc.OnHTML("img[alt*='@"+c.Params("user")+"' i]", func(e *colly.HTMLElement) { sc.OnHTML("img[alt*='@"+c.Params("user")+"' i]", func(e *colly.HTMLElement) {
Scrape.AvatarUrl = e.Attr("src") Scrape.AvatarUrl = e.Attr("src")

ファイルの表示

@@ -10,20 +10,17 @@
<div class="userProfile"> <div class="userProfile">
<h1>{{.Fullname}}</h1> <h1>{{.Fullname}}</h1>
{{ if .Fork }} {{ if .Parent }}
<p>This repository is a fork of <a href="/{{.Parent}}">{{.Parent}}</a>.</p> <p>This repository is a fork of <a href="/{{.Parent}}">{{.Parent}}</a>.</p>
{{ end }} {{ end }}
{{ if .Description }} {{ if .Description }}
<p>{{.Description}}</p> <p>{{.Description}}</p>
{{ end }} {{ end }}
{{ if .Language}}
{{ if .License }} {{ if .License }}
<p>⭐ {{.Stars}} 🍴 {{.Forks}} 👀 {{.Watchers}} ⚖️ {{.License}} 🗒️ {{.Language}}</p> <p>⭐ {{.Stars}} 🍴 {{.Forks}} 👀 {{.Watchers}} ⚖️ {{.License}} {{ if .Language }} 🗒️ {{.Language}} {{end}} 🌿 {{.DefaultBranch}}</p>
{{ else }} {{ else }}
<p>⭐ {{.Stars}} 🍴 {{.Forks}} 👀 {{.Watchers}} ⚖️ No license 🗒️ {{.Language}}</p> <p>⭐ {{.Stars}} 🍴 {{.Forks}} 👀 {{.Watchers}} ⚖️ No license {{ if .Language }} 🗒️ {{.Language}} {{end}} 🌿 {{.DefaultBranch}}</p>
{{ end }} {{ end }}
{{ else }}
{{ end }}
</div> </div>
{{end}} {{end}}
{{ if .files}} {{ if .files}}
@@ -44,7 +41,11 @@
{{ end }} {{ end }}
{{ if .readme}} {{ if .readme}}
<div class="userReadme"> <div class="userReadme">
<h3>README.md</h3> {{ if .repo }}
{{ range $key, $value := .repo}}
<h3>{{.Readme}}</h3>
{{end}}
{{end}}
<div class="userReadmeText"> <div class="userReadmeText">
{{ unescape .readme}} {{ unescape .readme}}
</div> </div>
@@ -54,4 +55,4 @@
<h2>Repository not found</h2> <h2>Repository not found</h2>
<p>That repository doesn't exist.</p> <p>That repository doesn't exist.</p>
{{ end }} {{ end }}
</main> </main>