updated to 1.24.0; fixed stb and skipped doubles in prescene
This commit is contained in:
@@ -1,11 +1,13 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-shiori/dom"
|
||||
@@ -30,79 +32,103 @@ func NewPresceneRepository(db *sqlx.DB) *Prescene {
|
||||
func (s *Prescene) GetPage(pageNumbers []string) ([]model.ExternalSources, error) {
|
||||
entries := make([]model.ExternalSources, 0)
|
||||
endpoint := viper.GetString(constant.CfgKeyEndpoint)
|
||||
//scope := viper.GetString(constant.CfgKeyScopeEnable)
|
||||
tags := viper.GetStringMapStringSlice("groups.tags")
|
||||
|
||||
for _, t := range pageNumbers {
|
||||
if t != "1" {
|
||||
endpoint += fmt.Sprintf(viper.GetString(constant.CfgKeyEndpointNext), t)
|
||||
}
|
||||
doc, err := parser.HTMLSourceFromURL(endpoint)
|
||||
//doc, err := parser.HTMLSourceFromURL("https://mdb.amok.space/$/scnlog.html")
|
||||
|
||||
if err != nil {
|
||||
slog.Error("Parse error", "err", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if doc == nil {
|
||||
slog.Warn("Document is nil", "err", err)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, item := range dom.QuerySelectorAll(doc, ".post.type-post.category-flac.category-music") {
|
||||
var es model.ExternalSources
|
||||
columns := []string{"`type`", "type_id", "title", "eXsource", "releaser", "created"}
|
||||
|
||||
title := dom.QuerySelector(item, ".title")
|
||||
if title != nil {
|
||||
anchor := dom.QuerySelector(title, "h1 > a")
|
||||
if anchor != nil {
|
||||
es.Type = constant.ScopePrescene
|
||||
es.Title = dom.GetAttribute(anchor, "title")
|
||||
es.ExSource = dom.GetAttribute(anchor, "href")
|
||||
pattern := regexp.MustCompile(`(?is)-(\w+)$`)
|
||||
es.Releaser = pattern.FindStringSubmatch(es.Title)[1]
|
||||
|
||||
for flag, groups := range tags {
|
||||
if slices.Contains(groups, es.Releaser) {
|
||||
es.A = flag
|
||||
es.H = flag
|
||||
columns = append(columns, "a", "h")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if es.A == constant.TagIgnore {
|
||||
slog.Info("Skipped", "releaser", es.Releaser)
|
||||
continue
|
||||
}
|
||||
|
||||
localtime := dom.QuerySelector(title, "small > span.localtime")
|
||||
if localtime != nil {
|
||||
lc := dom.GetAttribute(localtime, "data-lttime")
|
||||
es.Created = carbon.Parse(lc)
|
||||
}
|
||||
uri := viper.GetString(constant.FlagSingleUri)
|
||||
if uri != "" {
|
||||
url := fmt.Sprintf("%s/%s", strings.Trim(endpoint, "/"), strings.Trim(uri, "/"))
|
||||
result, _ := parseUrl(url, s.db)
|
||||
entries = append(entries, result...)
|
||||
} else {
|
||||
for _, t := range pageNumbers {
|
||||
if t != "1" {
|
||||
endpoint += fmt.Sprintf(viper.GetString(constant.CfgKeyEndpointNext), t)
|
||||
}
|
||||
|
||||
cls := dom.GetAttribute(item, "class")
|
||||
pattern := regexp.MustCompile(`(?s)^post-(\d+)\spost`)
|
||||
es.TypeId, _ = strconv.Atoi(pattern.FindStringSubmatch(cls)[1])
|
||||
//doc, err := parser.HTMLSourceFromURL("https://mdb.amok.space/$/scnlog.html")
|
||||
if result, err := parseUrl(endpoint, s.db); err == nil {
|
||||
entries = append(entries, result...)
|
||||
} else {
|
||||
slog.Error("parsing url", "err", err)
|
||||
}
|
||||
|
||||
//fmt.Println("====================== ", i, " ==============================")
|
||||
esModel := table.ExternalSources{Columns: columns}
|
||||
entry := esModel.InsertOnDuplicate(es, s.db)
|
||||
entries = append(entries, entry)
|
||||
|
||||
//fmt.Printf("%+v\n", entry)
|
||||
//fmt.Println("Sleeping...", j)
|
||||
time.Sleep(viper.GetDuration(constant.CfgKeySleepBeforeNextIteration))
|
||||
}
|
||||
|
||||
//fmt.Println("Sleeping...", j)
|
||||
time.Sleep(viper.GetDuration(constant.CfgKeySleepBeforeNextIteration))
|
||||
}
|
||||
|
||||
//fmt.Printf("scope: %v\n", scope)
|
||||
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
func parseUrl(endpoint string, db *sqlx.DB) ([]model.ExternalSources, error) {
|
||||
entries := make([]model.ExternalSources, 0)
|
||||
tags := viper.GetStringMapStringSlice("groups.tags")
|
||||
slog.Info("singleton", "url", endpoint)
|
||||
doc, err := parser.HTMLSourceFromURL(endpoint)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if doc == nil {
|
||||
return nil, errors.New("document is nil")
|
||||
}
|
||||
var validID = regexp.MustCompile(`-\d+\/$`)
|
||||
|
||||
for i, item := range dom.QuerySelectorAll(doc, ".post.type-post.category-flac.category-music") {
|
||||
var es model.ExternalSources
|
||||
columns := []string{"`type`", "type_id", "title", "eXsource", "releaser", "created"}
|
||||
|
||||
title := dom.QuerySelector(item, ".title")
|
||||
if title != nil {
|
||||
anchor := dom.QuerySelector(title, "h1 > a")
|
||||
if anchor != nil {
|
||||
es.Type = constant.ScopePrescene
|
||||
es.Title = dom.GetAttribute(anchor, "title")
|
||||
if es.Title == "Auto Draft" {
|
||||
slog.Info("Skipped", "title", es.Title)
|
||||
continue
|
||||
}
|
||||
|
||||
es.ExSource = dom.GetAttribute(anchor, "href")
|
||||
if validID.MatchString(es.ExSource) {
|
||||
continue
|
||||
}
|
||||
|
||||
pattern := regexp.MustCompile(`(?is)-(\w+)$`)
|
||||
es.Releaser = pattern.FindStringSubmatch(es.Title)[1]
|
||||
|
||||
for flag, groups := range tags {
|
||||
if slices.Contains(groups, es.Releaser) {
|
||||
es.A = flag
|
||||
es.H = flag
|
||||
columns = append(columns, "a", "h")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if es.A == constant.TagIgnore {
|
||||
slog.Info("Skipped", "releaser", es.Releaser)
|
||||
continue
|
||||
}
|
||||
|
||||
localtime := dom.QuerySelector(title, "small > span.localtime")
|
||||
if localtime != nil {
|
||||
lc := dom.GetAttribute(localtime, "data-lttime")
|
||||
es.Created = carbon.Parse(lc)
|
||||
}
|
||||
}
|
||||
|
||||
cls := dom.GetAttribute(item, "class")
|
||||
pattern := regexp.MustCompile(`(?s)^post-(\d+)\spost`)
|
||||
es.TypeId, _ = strconv.Atoi(pattern.FindStringSubmatch(cls)[1])
|
||||
|
||||
esModel := table.ExternalSources{Columns: columns}
|
||||
entry := esModel.InsertOnDuplicate(es, db)
|
||||
entries = append(entries, entry)
|
||||
|
||||
fmt.Println("====================== ", i, " ==============================")
|
||||
fmt.Printf("%+v\n", entry)
|
||||
}
|
||||
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
@@ -9,11 +9,13 @@ import (
|
||||
type Repository struct {
|
||||
_interface.Rutracker
|
||||
_interface.Prescene
|
||||
_interface.ShareTheBrutality
|
||||
}
|
||||
|
||||
func New(db *sqlx.DB) *Repository {
|
||||
return &Repository{
|
||||
Rutracker: NewRutrackerRepository(db),
|
||||
Prescene: NewPresceneRepository(db),
|
||||
Rutracker: NewRutrackerRepository(db),
|
||||
Prescene: NewPresceneRepository(db),
|
||||
ShareTheBrutality: NewShareTheBrutalityRepository(db),
|
||||
}
|
||||
}
|
||||
|
||||
154
pkg/repository/stb.go
Normal file
154
pkg/repository/stb.go
Normal file
@@ -0,0 +1,154 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/emersion/go-imap/v2"
|
||||
"github.com/go-shiori/dom"
|
||||
"github.com/golang-module/carbon/v2"
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/spf13/viper"
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"git.amok.space/yevhen/resource-scraper/helper/parser"
|
||||
"git.amok.space/yevhen/resource-scraper/helper/sugar"
|
||||
"git.amok.space/yevhen/resource-scraper/internal/mail"
|
||||
_table "git.amok.space/yevhen/resource-scraper/pkg/repository/table"
|
||||
"git.amok.space/yevhen/resource-scraper/types/constant"
|
||||
"git.amok.space/yevhen/resource-scraper/types/model"
|
||||
)
|
||||
|
||||
type ShareTheBrutality struct {
|
||||
scope string
|
||||
EmailService mail.EmailService
|
||||
db *sqlx.DB
|
||||
}
|
||||
|
||||
func NewShareTheBrutalityRepository(db *sqlx.DB) *ShareTheBrutality {
|
||||
return &ShareTheBrutality{db: db, scope: constant.ScopeShareTheBrutality}
|
||||
}
|
||||
|
||||
func (s *ShareTheBrutality) GetMail(email string) ([]model.ExternalSources, *mail.EmailService) {
|
||||
s.EmailService = mail.EmailService{
|
||||
User: email,
|
||||
}
|
||||
|
||||
s.EmailService.Login()
|
||||
|
||||
mailboxes := fmt.Sprintf("%s.mailboxes", s.scope)
|
||||
criteria := fmt.Sprintf("%s.search-criteria", s.scope)
|
||||
searchCriteria := &imap.SearchCriteria{
|
||||
Text: viper.GetStringSlice(criteria),
|
||||
}
|
||||
//now := carbon.Now()
|
||||
s.EmailService.ListMessages(viper.GetStringSlice(mailboxes), searchCriteria)
|
||||
//box.CreateMailbox("INBOX/Processed")
|
||||
//s.EmailService.CreateMailbox("Processed/Succeed")
|
||||
//s.EmailService.CreateMailbox("Processed/Failed")
|
||||
//s.EmailService.CreateMailbox("Processed/Suspicious")
|
||||
//s.EmailService.MailboxesList()
|
||||
|
||||
entries := s.Processing(viper.GetStringMapString(fmt.Sprintf("%s.sender", s.scope)))
|
||||
|
||||
return entries, &s.EmailService
|
||||
}
|
||||
|
||||
func (s *ShareTheBrutality) Processing(sender map[string]string) []model.ExternalSources {
|
||||
columns := []string{"`type`", "type_id", "title", "type_subsection_id", "releaser", "created", "fingerprint"}
|
||||
entriesBatched := make([]model.ExternalSources, 0)
|
||||
if len(s.EmailService.Messages) == 0 {
|
||||
return entriesBatched
|
||||
}
|
||||
|
||||
//tmpPath := viper.GetString(fmt.Sprintf("%s.storage.filepath", s.scope))
|
||||
dbType := viper.GetString(fmt.Sprintf("%s.db-type", s.scope))
|
||||
regexPatterns := viper.GetStringMapString(fmt.Sprintf("%s.regex", s.scope))
|
||||
topics := viper.GetStringMap(fmt.Sprintf("%s.topics", s.scope))
|
||||
|
||||
for _, msg := range s.EmailService.Messages {
|
||||
entries := make([]model.ExternalSources, 0)
|
||||
from := msg.Envelope.From[0]
|
||||
subject := msg.Envelope.Subject
|
||||
|
||||
if !(from.Mailbox == sender["mailbox"] && from.Host == sender["host"] && subject == sender["subject"]) {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, section := range msg.BodySection {
|
||||
//sugar.WriteDataToTmpFile(msg.BodySection, tmpPath)
|
||||
|
||||
if section.Bytes != nil {
|
||||
doc, err := parser.HTMLSource(string(section.Bytes))
|
||||
if s.EmailService.CheckErr("parsing message body", err) {
|
||||
continue
|
||||
}
|
||||
|
||||
table := dom.QuerySelector(doc, "body > table:nth-of-type(1n) table:nth-of-type(1n) table:nth-of-type(2n) > tbody")
|
||||
if table == nil {
|
||||
s.EmailService.Warn("dom.QuerySelector had not queried any data, returned nil")
|
||||
continue
|
||||
}
|
||||
|
||||
var es model.ExternalSources
|
||||
for _, td := range dom.QuerySelectorAll(table, "tr > td:nth-child(2)") {
|
||||
anchor := dom.QuerySelector(td, "h2 > a")
|
||||
if anchor == nil {
|
||||
s.EmailService.Warn("dom.QuerySelector couldn't find title")
|
||||
continue
|
||||
}
|
||||
es.Title = sugar.SqueezeLine(dom.InnerHTML(anchor))
|
||||
|
||||
u, err := url.Parse(dom.GetAttribute(anchor, "href"))
|
||||
if s.EmailService.CheckErr("parsing url", err) {
|
||||
continue
|
||||
}
|
||||
|
||||
es.Fingerprint = u.RequestURI()
|
||||
pattern := regexp.MustCompile(regexPatterns["type-id"])
|
||||
typeIdMatch := pattern.FindStringSubmatch(es.Fingerprint)
|
||||
if len(typeIdMatch) != 2 {
|
||||
s.EmailService.Warn("Regexp => typeIdMatch not matched")
|
||||
continue
|
||||
}
|
||||
es.TypeId, _ = strconv.Atoi(typeIdMatch[1])
|
||||
|
||||
sourceData := dom.QuerySelector(td, "p:first-child")
|
||||
if sourceData == nil {
|
||||
s.EmailService.Warn("dom.QuerySelector couldn't find sourceData in paragraph")
|
||||
continue
|
||||
}
|
||||
sourceDataString := html.UnescapeString(sugar.SqueezeLine(dom.InnerHTML(sourceData)))
|
||||
pattern = regexp.MustCompile(regexPatterns["who-genre"])
|
||||
sourceDataMatch := pattern.FindStringSubmatch(sourceDataString)
|
||||
|
||||
if len(sourceDataMatch) != 3 {
|
||||
s.EmailService.Warn("Regexp => sourceData not matched")
|
||||
continue
|
||||
}
|
||||
|
||||
es.Releaser = sourceDataMatch[1]
|
||||
es.Created = carbon.Parse(msg.Envelope.Date.String())
|
||||
es.Type = dbType
|
||||
|
||||
genre := strings.ToLower(sourceDataMatch[2])
|
||||
es.TypeSubsectionId = topics[genre].(int)
|
||||
|
||||
entries = append(entries, es)
|
||||
}
|
||||
|
||||
result, status := _table.BatchInsertOnDuplicate(entries, s.db, columns)
|
||||
if status != constant.StatusFailed {
|
||||
entriesBatched = append(entriesBatched, result...)
|
||||
}
|
||||
|
||||
s.EmailService.MoveMessageToMailbox(msg, status)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return entriesBatched
|
||||
}
|
||||
@@ -2,10 +2,14 @@ package table
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/logrusorgru/aurora/v4"
|
||||
|
||||
"git.amok.space/yevhen/resource-scraper/helper/thither"
|
||||
"git.amok.space/yevhen/resource-scraper/types/constant"
|
||||
"git.amok.space/yevhen/resource-scraper/types/model"
|
||||
)
|
||||
@@ -20,8 +24,6 @@ func (f *ExternalSources) InsertOnDuplicate(es model.ExternalSources, db *sqlx.D
|
||||
placeholders = ":" + strings.Replace(placeholders, "`", "", -1)
|
||||
query := fmt.Sprintf(stmt, constant.ExternalSourcesTable, strings.Join(f.Columns, ", "), placeholders)
|
||||
|
||||
//fmt.Printf("%s\n", query)
|
||||
|
||||
if rows, err := db.NamedQuery(query, &es); err == nil {
|
||||
for rows.Next() {
|
||||
es.Error = rows.StructScan(&es)
|
||||
@@ -32,3 +34,45 @@ func (f *ExternalSources) InsertOnDuplicate(es model.ExternalSources, db *sqlx.D
|
||||
|
||||
return es
|
||||
}
|
||||
|
||||
func BatchInsertOnDuplicate(entries []model.ExternalSources, db *sqlx.DB, columns []string) ([]model.ExternalSources, string) {
|
||||
es := &ExternalSources{Columns: columns}
|
||||
typeIds := es.GetTypeIds(entries, db)
|
||||
var status string
|
||||
errCount := 0
|
||||
|
||||
for i := 0; i < len(entries); i++ {
|
||||
entry := es.InsertOnDuplicate(entries[i], db)
|
||||
if entry.Error != nil {
|
||||
slog.Error("insert/update entry", "err", entry.Error)
|
||||
errCount++
|
||||
}
|
||||
|
||||
if !slices.Contains(typeIds, entry.TypeId) {
|
||||
fmt.Printf("%s: %s\n", aurora.Green("ADDED"), aurora.White(entry.Title))
|
||||
}
|
||||
|
||||
entries[i] = es.InsertOnDuplicate(entries[i], db)
|
||||
}
|
||||
|
||||
if errCount == 0 {
|
||||
status = constant.StatusSucceed
|
||||
} else if errCount > 0 && errCount == len(entries) {
|
||||
status = constant.StatusFailed
|
||||
} else {
|
||||
status = constant.StatusSuspicious
|
||||
}
|
||||
|
||||
return entries, status
|
||||
}
|
||||
|
||||
func (f *ExternalSources) GetTypeIds(entries []model.ExternalSources, db *sqlx.DB) []int {
|
||||
var typeIds []int
|
||||
ids := thither.FieldValueToStrSlice(entries, "TypeId")
|
||||
query := fmt.Sprintf("SELECT type_id FROM %s WHERE `type` = '%s' AND type_id IN (%s) LIMIT %d", constant.ExternalSourcesTable, entries[0].Type, strings.Join(ids, ","), len(ids))
|
||||
err := db.Select(&typeIds, query)
|
||||
if err != nil {
|
||||
slog.Error("getting type ids", "err", err)
|
||||
}
|
||||
return typeIds
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user