prescene scraper added

This commit is contained in:
2024-09-10 15:57:48 +03:00
parent bfdfc634e4
commit 0ecf0ddec1
38 changed files with 790 additions and 167 deletions

108
pkg/repository/prescene.go Normal file
View File

@@ -0,0 +1,108 @@
package repository
import (
"fmt"
"log/slog"
"regexp"
"slices"
"strconv"
"time"
"github.com/go-shiori/dom"
"github.com/golang-module/carbon/v2"
"github.com/jmoiron/sqlx"
"github.com/spf13/viper"
"git.amok.space/yevhen/resource-scraper/helper/parser"
"git.amok.space/yevhen/resource-scraper/pkg/repository/table"
"git.amok.space/yevhen/resource-scraper/types/constant"
"git.amok.space/yevhen/resource-scraper/types/model"
)
// Prescene is the repository used for the prescene scope. It keeps the
// database handle that scraped posts are written through.
type Prescene struct {
	// db is the connection handed to table.ExternalSources.InsertOnDuplicate.
	db *sqlx.DB
}
// NewPresceneRepository returns a Prescene repository that writes through db.
func NewPresceneRepository(db *sqlx.DB) *Prescene {
	repo := new(Prescene)
	repo.db = db
	return repo
}
// GetPage scrapes the listing pages named in pageNumbers and upserts every
// matching post into the external-sources table.
//
// Page "1" is fetched from the configured endpoint itself; any other page
// appends the configured "next page" format (filled in with the page number)
// to that endpoint. A page that cannot be fetched or parsed is logged and
// skipped so that one bad page does not abort the whole run.
//
// A post is skipped, with a log line, when it has no title block or title
// anchor, when no numeric post id can be read from its class attribute, or
// when its releaser belongs to the group tagged constant.TagIgnore. A title
// without a "-RELEASER" suffix is still stored, with an empty releaser.
//
// The returned slice holds the value InsertOnDuplicate produced for every
// stored post (insert failures are carried in each entry's Error field).
// The returned error is always nil.
func (s *Prescene) GetPage(pageNumbers []string) ([]model.ExternalSources, error) {
	// Compiled once per call instead of once per post.
	releaserRe := regexp.MustCompile(`(?is)-(\w+)$`)
	postIDRe := regexp.MustCompile(`(?s)^post-(\d+)\spost`)

	entries := make([]model.ExternalSources, 0)
	endpoint := viper.GetString(constant.CfgKeyEndpoint)
	nextFormat := viper.GetString(constant.CfgKeyEndpointNext)
	tags := viper.GetStringMapStringSlice("groups.tags")

	for _, page := range pageNumbers {
		// Build every page URL from the untouched base endpoint. Appending to
		// the base itself would stack the suffixes of previously visited pages.
		pageURL := endpoint
		if page != "1" {
			pageURL += fmt.Sprintf(nextFormat, page)
		}

		doc, err := parser.HTMLSourceFromURL(pageURL)
		if err != nil {
			slog.Error("Parse error", "url", pageURL, "err", err)
			continue
		}
		if doc == nil {
			slog.Warn("Document is nil", "url", pageURL)
			continue
		}

		for _, item := range dom.QuerySelectorAll(doc, ".post.type-post.category-flac.category-music") {
			title := dom.QuerySelector(item, ".title")
			if title == nil {
				slog.Warn("Post has no title block, skipped", "url", pageURL)
				continue
			}
			anchor := dom.QuerySelector(title, "h1 > a")
			if anchor == nil {
				slog.Warn("Post has no title anchor, skipped", "url", pageURL)
				continue
			}

			// The numeric post id is the first class token ("post-<id> post ...").
			// Without it the row cannot be identified, so the post is dropped
			// instead of indexing into a nil match.
			class := dom.GetAttribute(item, "class")
			idMatch := postIDRe.FindStringSubmatch(class)
			if idMatch == nil {
				slog.Warn("Post id not found in class attribute, skipped", "class", class)
				continue
			}
			typeID, err := strconv.Atoi(idMatch[1])
			if err != nil {
				slog.Warn("Post id is not a valid integer, skipped", "class", class, "err", err)
				continue
			}

			var es model.ExternalSources
			columns := []string{"`type`", "type_id", "title", "eXsource", "releaser", "created"}

			es.Type = constant.ScopePrescene
			es.TypeId = typeID
			es.Title = dom.GetAttribute(anchor, "title")
			es.ExSource = dom.GetAttribute(anchor, "href")

			// The releaser is the word after the last "-" of the title.
			if m := releaserRe.FindStringSubmatch(es.Title); m != nil {
				es.Releaser = m[1]
				for flag, groups := range tags {
					if slices.Contains(groups, es.Releaser) {
						es.A = flag
						es.H = flag
						columns = append(columns, "a", "h")
						break
					}
				}
			} else {
				slog.Warn("Releaser not found in title", "title", es.Title)
			}

			if es.A == constant.TagIgnore {
				slog.Info("Skipped", "releaser", es.Releaser)
				continue
			}

			if localtime := dom.QuerySelector(title, "small > span.localtime"); localtime != nil {
				es.Created = carbon.Parse(dom.GetAttribute(localtime, "data-lttime"))
			}

			esModel := table.ExternalSources{Columns: columns}
			entries = append(entries, esModel.InsertOnDuplicate(es, s.db))
		}

		// Pause between pages for the configured duration.
		time.Sleep(viper.GetDuration(constant.CfgKeySleepBeforeNextIteration))
	}

	return entries, nil
}

View File

@@ -2,22 +2,18 @@ package repository
import (
"github.com/jmoiron/sqlx"
"github.com/spf13/viper"
"git.amok.space/yevhen/resource-scraper/types"
"git.amok.space/yevhen/resource-scraper/types/interface"
)
// Repository groups the per-source repositories by embedding them, so their
// methods are reachable directly on a Repository value.
// NOTE(review): this span comes from a diff view without +/- markers; it looks
// like types.Rutracker is the removed line and the two _interface.* lines are
// its replacement — confirm against the checked-out file.
type Repository struct {
types.Rutracker
_interface.Rutracker
_interface.Prescene
}
// New builds a Repository whose members are backed by db.
// NOTE(review): removed and added diff lines are interleaved below. The
// switch on viper's "scope" value (with the empty-Repository fallback) appears
// to be the old body, and the single return that wires both
// NewRutrackerRepository and NewPresceneRepository appears to be the new one —
// confirm against the checked-out file.
func New(db *sqlx.DB) *Repository {
switch viper.GetString("scope") {
case types.RuTracker:
return &Repository{
Rutracker: NewRutracker(db),
}
return &Repository{
Rutracker: NewRutrackerRepository(db),
Prescene: NewPresceneRepository(db),
}
return &Repository{}
}

View File

@@ -8,60 +8,59 @@ import (
"net/http"
"net/url"
"strconv"
"time"
"github.com/golang-module/carbon/v2"
"github.com/jmoiron/sqlx"
"github.com/spf13/viper"
iface "git.amok.space/yevhen/resource-scraper/types"
"git.amok.space/yevhen/resource-scraper/types/table"
"git.amok.space/yevhen/resource-scraper/pkg/repository/table"
"git.amok.space/yevhen/resource-scraper/types/constant"
"git.amok.space/yevhen/resource-scraper/types/model"
"git.amok.space/yevhen/resource-scraper/types/resource"
)
// Rutracker is the repository used for the rutracker scope. It keeps the
// database handle that fetched topic entries are written through.
type Rutracker struct {
	// db is the connection used when storing entries.
	db *sqlx.DB
}
// NewRutrackerRepository returns a Rutracker repository that writes through db.
// NOTE(review): two signatures are shown because this is a diff view;
// NewRutracker looks like the removed name and NewRutrackerRepository its
// replacement — confirm against the checked-out file.
func NewRutracker(db *sqlx.DB) *Rutracker {
func NewRutrackerRepository(db *sqlx.DB) *Rutracker {
return &Rutracker{db: db}
}
// GetTopic fetches the feed of every id in topics and stores each feed entry.
// The configured endpoint is used as a format string that receives the topic
// id. The first fetch failure is logged and ends the call.
// NOTE(review): this span is a diff view without +/- markers, so old and new
// lines are interleaved. The variant returning ([]model.ExternalSources, error)
// and storing through table.ExternalSources.InsertOnDuplicate appears to be
// the new one; the hand-written INSERT query appears to be the removed one —
// confirm against the checked-out file.
func (s *Rutracker) GetTopic(topics []string) error {
endpoint := viper.GetString("endpoint")
func (s *Rutracker) GetTopic(topics []string) ([]model.ExternalSources, error) {
endpoint := viper.GetString(constant.CfgKeyEndpoint)
entries := make([]model.ExternalSources, 0)
columns := []string{"`type`", "type_id", "title", "type_subsection_id", "releaser", "created"}
for _, t := range topics {
topic, err := fetch(fmt.Sprintf(endpoint, t))
if err != nil {
slog.Error("couldn't parse topic data", "err", err.Error())
return entries, err
}
for i, e := range topic.Entry {
var id int
var es table.ExternalSources
for _, e := range topic.Entry {
var es model.ExternalSources
// The entry id is read from the "t" query parameter of the entry link;
// URL and integer parse errors are discarded here.
u, _ := url.Parse(e.Link.Href)
es.Type = "rutracker"
es.Type = constant.ScopeRuTracker
es.TypeId, _ = strconv.Atoi(u.Query().Get("t"))
es.Title = e.Title
// The requested topic id is stored as the subsection id.
es.TypeSubsectionId, _ = strconv.Atoi(t)
es.Releaser = e.Author.Name
es.Created, _ = time.Parse(time.RFC3339, e.Updated)
created := es.Created.Format(iface.DateTimeFormat)
es.Created = carbon.Parse(e.Updated)
query := fmt.Sprintf("INSERT INTO %s (`type`, type_id, title, type_subsection_id, releaser, created) VALUES (?, ?, ?, ?, ?, ?) ON DUPLICATE KEY UPDATE title=?, created=? RETURNING id", iface.ExternalSourcesTable)
row := s.db.QueryRow(query, es.Type, es.TypeId, es.Title, es.TypeSubsectionId, es.Releaser, created, es.Title, created)
if err = row.Scan(&id); err != nil {
return err
}
fmt.Println("<< ----------------- ", i+1, id, " ----------------- >>")
esModel := table.ExternalSources{Columns: columns}
entry := esModel.InsertOnDuplicate(es, s.db)
entries = append(entries, entry)
//fmt.Printf("%+v\n\n\n", entry)
}
}
return nil
return entries, nil
}
func fetch(endpoint string) (*iface.RutrackerAtomTopic, error) {
func fetch(endpoint string) (*resource.RutrackerAtomTopic, error) {
resp, err := http.Get(endpoint)
if err != nil {
slog.Error("couldn't fetch data", endpoint, err.Error())
@@ -75,7 +74,7 @@ func fetch(endpoint string) (*iface.RutrackerAtomTopic, error) {
}
}(resp.Body)
topic := &iface.RutrackerAtomTopic{}
topic := &resource.RutrackerAtomTopic{}
if err = xml.NewDecoder(resp.Body).Decode(topic); err != nil {
return nil, err

View File

@@ -0,0 +1,34 @@
package table
import (
"fmt"
"strings"
"github.com/jmoiron/sqlx"
"git.amok.space/yevhen/resource-scraper/types/constant"
"git.amok.space/yevhen/resource-scraper/types/model"
)
// ExternalSources builds statements against the external-sources table.
type ExternalSources struct {
	// Columns lists the column names written by InsertOnDuplicate. A name may
	// be back-quoted (for example "`type`"); the quotes are removed when the
	// matching named parameter is derived.
	Columns []string
}
// InsertOnDuplicate upserts es into the external-sources table and returns es
// refreshed from the row the statement returns.
//
// The INSERT lists f.Columns, and each column is bound to the named parameter
// of the same name with back-quotes stripped (so "`type`" binds ":type"). On a
// duplicate key only title and created are updated. The statement ends with
// RETURNING id, and every returned row is scanned back into es.
//
// Failures are reported through the Error field of the returned value rather
// than through a separate return value: an empty column list, a failed query,
// a failed scan, or an error met while iterating the result set.
func (f *ExternalSources) InsertOnDuplicate(es model.ExternalSources, db *sqlx.DB) model.ExternalSources {
	// Without columns the generated statement would be syntactically invalid,
	// so fail early with a clear message instead of a driver syntax error.
	if len(f.Columns) == 0 {
		es.Error = fmt.Errorf("insert into %s: no columns given", constant.ExternalSourcesTable)
		return es
	}

	const stmt = "INSERT INTO %s (%s) VALUES (%s) ON DUPLICATE KEY UPDATE title=:title, created=:created RETURNING id"

	columns := strings.Join(f.Columns, ", ")
	// Named parameters mirror the column names, minus any back-quotes.
	placeholders := ":" + strings.ReplaceAll(strings.Join(f.Columns, ", :"), "`", "")
	query := fmt.Sprintf(stmt, constant.ExternalSourcesTable, columns, placeholders)

	rows, err := db.NamedQuery(query, &es)
	if err != nil {
		es.Error = err
		return es
	}
	// Release the result set even when scanning stops early.
	defer rows.Close()

	for rows.Next() {
		if es.Error = rows.StructScan(&es); es.Error != nil {
			return es
		}
	}
	// Next returning false can also mean iteration failed; surface that case.
	if err := rows.Err(); err != nil {
		es.Error = err
	}

	return es
}