summaryrefslogtreecommitdiff
path: root/internal/processor/txt.go
blob: f230d374ce80e0349a80ef332859595aa0422d00 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
package processor

import (
	"fmt"
	"html"
	"os"
	"regexp"
	"strings"
	"time"

	"codeberg.org/snonux/snonux/internal/post"
)

// urlPattern matches http/https URLs in plain text: the scheme followed by
// every non-whitespace character up to the next whitespace. Because the match
// runs to the next whitespace, it can swallow sentence punctuation that
// directly follows a URL; that is stripped separately by stripURLTrailing.
var urlPattern = regexp.MustCompile(`https?://\S+`)

// txtBuilder handles plain-text (.txt) sources. It carries no state; init
// registers a value of it for the ".txt" extension.
type txtBuilder struct{}

// Plan renders the text file at srcPath to HTML via processTxt and returns a
// postPlan carrying the source path, the extension and the rendered HTML.
// If the file cannot be processed, the zero postPlan and the error are returned.
func (txtBuilder) Plan(srcPath string, ext string) (postPlan, error) {
	// Named "rendered" rather than "html" so the imported html package is not shadowed.
	rendered, err := processTxt(srcPath)
	if err != nil {
		return postPlan{}, err
	}

	return postPlan{srcPath: srcPath, ext: ext, textHTML: rendered}, nil
}

// Commit turns a planned text post into a post.Post of type post.TypeText,
// using id and now as its ID and timestamp and the HTML prepared by Plan as
// its content. postDir is not used by this builder. The []string result is
// always nil and the error is always nil.
func (txtBuilder) Commit(plan postPlan, postDir string, id string, now time.Time) (*post.Post, []string, error) {
	p := new(post.Post)
	p.ID = id
	p.Timestamp = now
	p.PostType = post.TypeText
	p.Content = plan.textHTML

	return p, nil, nil
}

// init registers a txtBuilder for the ".txt" extension.
func init() {
	const ext = ".txt"
	register(ext, txtBuilder{})
}

// processTxt reads the plain-text file at path and renders it as HTML.
//
// The text is split into paragraphs at blank lines. Each paragraph is passed
// through formatParagraph (which auto-links URLs and HTML-escapes the rest)
// and emitted as "<p>...</p>" followed by a newline. A line that holds only
// whitespace counts as blank, so files with CRLF line endings or stray spaces
// on the separator line still split into separate paragraphs.
//
// A file that is empty or contains only whitespace yields "<p></p>".
// A read failure is returned wrapped with the offending path.
func processTxt(path string) (string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("read txt %s: %w", path, err)
	}

	raw := strings.TrimSpace(string(data))
	if raw == "" {
		return "<p></p>", nil
	}

	var sb strings.Builder
	var para []string

	// flush emits the lines collected so far as a single <p> element.
	flush := func() {
		if len(para) == 0 {
			return
		}
		fmt.Fprintf(&sb, "<p>%s</p>\n", formatParagraph(strings.Join(para, "\n")))
		para = para[:0]
	}

	for _, line := range strings.Split(raw, "\n") {
		// TrimSpace also removes a trailing "\r", so "\r\n\r\n" is seen as a
		// blank line just like "\n\n".
		if strings.TrimSpace(line) == "" {
			flush()
			continue
		}
		para = append(para, line)
	}
	flush()

	return sb.String(), nil
}

// formatParagraph renders one paragraph of plain text as an HTML fragment.
// Every line is trimmed, whitespace-only lines are dropped, and each remaining
// line is run through autolinkLine. The results are joined with "<br>"
// followed by a newline, so single line breaks in the source stay visible.
func formatParagraph(para string) string {
	var out strings.Builder
	wrote := false

	for _, raw := range strings.Split(para, "\n") {
		text := strings.TrimSpace(raw)
		if text == "" {
			continue
		}

		// Separator goes between lines only, never before the first one.
		if wrote {
			out.WriteString("<br>\n")
		}
		out.WriteString(autolinkLine(text))
		wrote = true
	}

	return out.String()
}

// autolinkLine renders one line of plain text as HTML. Text outside URLs is
// HTML-escaped. Each URL found by urlPattern is trimmed with stripURLTrailing
// and wrapped in an <a> element that opens in a new tab with
// rel="noopener noreferrer". Whatever stripURLTrailing removed is written
// back, escaped, right after the link.
func autolinkLine(line string) string {
	var out strings.Builder
	cursor := 0

	// With no matches the loop body never runs and the whole line is escaped
	// by the final write below.
	for _, span := range urlPattern.FindAllStringIndex(line, -1) {
		start, end := span[0], span[1]

		// Plain text between the previous match and this one.
		out.WriteString(html.EscapeString(line[cursor:start]))

		match := line[start:end]
		link := stripURLTrailing(match)
		escaped := html.EscapeString(link)

		fmt.Fprintf(&out, `<a href="%s" target="_blank" rel="noopener noreferrer">%s</a>`,
			escaped, escaped)

		// Punctuation stripped from the URL stays in the output as text.
		out.WriteString(html.EscapeString(match[len(link):]))

		cursor = end
	}

	out.WriteString(html.EscapeString(line[cursor:]))

	return out.String()
}

// stripURLTrailing removes sentence punctuation from the end of a URL match
// and returns the remaining prefix of u.
//
// The characters . , ; : ! ? " ' > are always removed from the end: they are
// valid in URLs but almost never end one in prose (e.g. in
// "Visit https://foo.com." the "." ends the sentence).
//
// A trailing closing bracket ")", "]" or "}" is removed only when u contains
// more of that closer than of its matching opener. This keeps URLs such as
// "https://en.wikipedia.org/wiki/Go_(programming_language)" or "http://[::1]"
// intact, while still dropping the ")" in "(see https://foo.com)".
func stripURLTrailing(u string) string {
	const punct = ".,;:!?\"'>"

	for len(u) > 0 {
		last := u[len(u)-1]

		var opener string
		switch last {
		case ')':
			opener = "("
		case ']':
			opener = "["
		case '}':
			opener = "{"
		default:
			// Not a bracket: strip it only if it is sentence punctuation.
			if strings.IndexByte(punct, last) < 0 {
				return u
			}
			u = u[:len(u)-1]
			continue
		}

		// A closer that is balanced by an opener inside the URL belongs to it.
		if strings.Count(u, string(last)) <= strings.Count(u, opener) {
			return u
		}
		u = u[:len(u)-1]
	}

	return u
}