Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Nitter
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
TeDomum
Nitter
Commits
3b5b4b76
Commit
3b5b4b76
authored
4 years ago
by
Zed
Browse files
Options
Downloads
Patches
Plain Diff
Improve tweet url and hashtag parsing
parent
50218bcc
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/parserutils.nim
+115
-55
115 additions, 55 deletions
src/parserutils.nim
with
115 additions
and
55 deletions
src/parserutils.nim
+
115
−
55
View file @
3b5b4b76
import
strutils
,
times
,
macros
,
htmlgen
,
unicode
,
options
import
strutils
,
times
,
macros
,
htmlgen
,
unicode
,
options
,
algorithm
import
regex
,
packedjson
import
types
,
utils
,
formatters
...
...
@@ -6,9 +6,18 @@ const
unRegex
=
re"(^|[^A-z0-9-_./?])@([A-z0-9_]{1,15})"
unReplace
=
"
$1
<a href=
\"
/
$2
\"
>@
$2
</a>"
htRegex
=
re"(^|[^\w-_./?])([#$])([\w_]+)"
htRegex
=
re"(^|[^\w-_./?])([#
#
$])([\w_]+)"
htReplace
=
"
$1
<a href=
\"
/search?q=%23
$3
\"
>
$2$3
</a>"
type
ReplaceSliceKind
=
enum
rkRemove
,
rkUrl
,
rkHashtag
,
rkMention
ReplaceSlice
=
object
slice
:
Slice
[
int
]
kind
:
ReplaceSliceKind
url
,
display
:
string
template
isNull
*
(
js
:
JsonNode
):
bool
=
js
.
kind
==
JNull
template
notNull
*
(
js
:
JsonNode
):
bool
=
js
.
kind
!=
JNull
...
...
@@ -124,65 +133,97 @@ proc getTombstone*(js: JsonNode): string =
result
=
js
{
"tombstoneInfo"
,
"richText"
,
"text"
}.
getStr
result
.
removeSuffix
(
" Learn more"
)
template
getSlice
(
text
:
string
;
slice
:
seq
[
int
]
):
string
=
text
.
runeSubStr
(
slice
[
0
]
,
slice
[
1
]
-
slice
[
0
]
)
proc
getSlice
(
text
:
string
;
js
:
JsonNode
):
string
=
if
js
.
kind
!=
JArray
or
js
.
len
<
2
or
js
[
0
]
.
kind
!=
JInt
:
return
text
let
slice
=
@[
js
{
0
}.
getInt
,
js
{
1
}.
getInt
]
text
.
getSlice
(
slice
)
proc
expandUrl
(
text
:
var
string
;
js
:
JsonNode
;
tLen
:
int
;
hideTwitter
=
false
)
=
let
u
=
js
{
"url"
}.
getStr
if
u
.
len
==
0
or
u
notin
text
:
return
proc
extractSlice
(
js
:
JsonNode
):
Slice
[
int
]
=
result
=
js
[
"indices"
][
0
]
.
getInt
..
<
js
[
"indices"
][
1
]
.
getInt
proc
extractUrls
(
result
:
var
seq
[
ReplaceSlice
]
;
js
:
JsonNode
;
textLen
:
int
;
hideTwitter
=
false
)
=
let
url
=
js
{
"expanded_url"
}
.
getStr
slice
=
js
{
"indices"
}
[
1
]
.
getInt
url
=
js
[
"expanded_url"
]
.
getStr
slice
=
js
.
extractSlice
if
hideTwitter
and
slice
>=
tLen
and
url
.
isTwitterUrl
:
text
=
text
.
replace
(
u
,
""
)
text
.
removeSuffix
(
' '
)
text
.
removeSuffix
(
'
\n
'
)
if
hideTwitter
and
slice
.
b
.
succ
>=
textLen
and
url
.
isTwitterUrl
:
if
slice
.
a
<
textLen
:
result
.
add
ReplaceSlice
(
kind
:
rkRemove
,
slice
:
slice
)
else
:
text
=
text
.
replace
(
u
,
a
(
shortLink
(
url
),
href
=
url
))
proc
expandMention
(
text
:
var
string
;
orig
:
string
;
js
:
JsonNode
)
=
let
name
=
js
{
"name"
}.
getStr
href
=
'/'
&
js
{
"screen_name"
}.
getStr
uname
=
orig
.
getSlice
(
js
{
"indices"
})
text
=
text
.
replace
(
uname
,
a
(
uname
,
href
=
href
,
title
=
name
))
result
.
add
ReplaceSlice
(
kind
:
rkUrl
,
url
:
url
,
display
:
url
.
shortLink
,
slice
:
slice
)
proc
extractHashtags
(
result
:
var
seq
[
ReplaceSlice
]
;
js
:
JsonNode
)
=
result
.
add
ReplaceSlice
(
kind
:
rkHashtag
,
slice
:
js
.
extractSlice
)
proc
replacedWith
(
runes
:
seq
[
Rune
]
;
repls
:
openArray
[
ReplaceSlice
]
;
textSlice
:
Slice
[
int
]
):
string
=
template
extractLowerBound
(
i
:
int
;
idx
):
int
=
if
i
>
0
:
repls
[
idx
]
.
slice
.
b
.
succ
else
:
textSlice
.
a
result
=
newStringOfCap
(
runes
.
len
)
var
upperBound
=
textSlice
.
b
for
i
,
rep
in
repls
:
result
.
add
$
runes
[
extractLowerBound
(
i
,
i
-
1
)
..
<
rep
.
slice
.
a
]
case
rep
.
kind
of
rkHashtag
:
let
name
=
$
runes
[
rep
.
slice
.
a
.
succ
..
rep
.
slice
.
b
]
symbol
=
$
runes
[
rep
.
slice
.
a
]
result
.
add
a
(
symbol
&
name
,
href
=
"/search?q=%23"
&
name
)
of
rkMention
:
result
.
add
a
(
$
runes
[
rep
.
slice
]
,
href
=
rep
.
url
,
title
=
rep
.
display
)
of
rkUrl
:
result
.
add
a
(
rep
.
display
,
href
=
rep
.
url
)
of
rkRemove
:
discard
let
rest
=
extractLowerBound
(
repls
.
len
,
^
1
)
..
<
textSlice
.
b
if
rest
.
a
<=
rest
.
b
:
result
.
add
$
runes
[
rest
]
proc
deduplicate
(
s
:
var
seq
[
ReplaceSlice
]
)
=
var
len
=
s
.
len
i
=
0
while
i
<
len
:
var
j
=
i
+
1
while
j
<
len
:
if
s
[
i
]
.
slice
.
a
==
s
[
j
]
.
slice
.
a
:
s
.
del
j
dec
len
else
:
inc
j
inc
i
proc
cmp
(
x
,
y
:
ReplaceSlice
):
int
=
cmp
(
x
.
slice
.
a
,
y
.
slice
.
b
)
proc
expandProfileEntities
*
(
profile
:
var
Profile
;
js
:
JsonNode
)
=
let
orig
=
profile
.
bio
orig
=
profile
.
bio
.
toRunes
ent
=
?
js
{
"entities"
}
with
urls
,
ent
{
"url"
,
"urls"
}:
profile
.
website
=
urls
[
0
]
{
"expanded_url"
}.
getStr
var
replacements
=
newSeq
[
ReplaceSlice
]
()
with
urls
,
ent
{
"description"
,
"urls"
}:
for
u
in
urls
:
profile
.
bio
.
expandUrl
(
u
,
orig
.
high
)
for
u
in
urls
:
replacements
.
extractUrls
(
u
,
orig
.
high
)
replacements
.
deduplicate
replacements
.
sort
(
cmp
)
profile
.
bio
=
orig
.
replacedWith
(
replacements
,
0
..
orig
.
len
)
profile
.
bio
=
profile
.
bio
.
replace
(
unRegex
,
unReplace
)
.
replace
(
htRegex
,
htReplace
)
for
mention
in
?
ent
{
"user_mentions"
}:
profile
.
bio
.
expandMention
(
orig
,
mention
)
proc
expandTweetEntities
*
(
tweet
:
Tweet
;
js
:
JsonNode
)
=
let
orig
=
tweet
.
text
orig
=
tweet
.
text
.
toRunes
textRange
=
js
{
"display_text_range"
}
s
lice
=
@[
textRange
{
0
}.
getInt
,
textRange
{
1
}.
getInt
]
textS
lice
=
textRange
{
0
}.
getInt
..
textRange
{
1
}.
getInt
hasQuote
=
js
{
"is_quote_status"
}.
getBool
hasCard
=
tweet
.
card
.
isSome
tweet
.
text
=
tweet
.
text
.
getSlice
(
slice
)
var
replyTo
=
""
if
tweet
.
replyId
!=
0
:
with
reply
,
js
{
"in_reply_to_screen_name"
}:
...
...
@@ -191,26 +232,45 @@ proc expandTweetEntities*(tweet: Tweet; js: JsonNode) =
let
ent
=
?
js
{
"entities"
}
var
replacements
=
newSeq
[
ReplaceSlice
]
()
with
urls
,
ent
{
"urls"
}:
for
u
in
urls
:
tweet
.
text
.
expandUrl
(
u
,
slice
[
1
]
,
hasQuote
)
let
urlStr
=
u
[
"url"
]
.
getStr
if
urlStr
.
len
==
0
or
urlStr
notin
tweet
.
text
:
continue
replacements
.
extractUrls
(
u
,
textSlice
.
b
,
hideTwitter
=
hasQuote
)
if
hasCard
and
u
{
"url"
}.
getStr
==
get
(
tweet
.
card
).
url
:
get
(
tweet
.
card
).
url
=
u
{
"expanded_url"
}.
getStr
with
media
,
ent
{
"media"
}:
for
m
in
media
:
tweet
.
text
.
expandUrl
(
m
,
slice
[
1
]
,
hideTwitter
=
true
)
if
"hashtags"
in
ent
or
"symbols"
in
ent
:
tweet
.
text
=
tweet
.
text
.
replace
(
htRegex
,
htReplace
)
for
mention
in
?
ent
{
"user_mentions"
}:
let
name
=
mention
{
"screen_name"
}.
getStr
idx
=
tweet
.
reply
.
find
(
name
)
if
mention
{
"indices"
}
[
0
]
.
getInt
>=
slice
[
0
]
:
tweet
.
text
.
expandMention
(
orig
,
mention
)
if
idx
>
-
1
and
name
!=
replyTo
:
tweet
.
reply
.
delete
idx
elif
idx
==
-
1
and
tweet
.
replyId
!=
0
:
tweet
.
reply
.
add
name
for
m
in
media
:
replacements
.
extractUrls
(
m
,
textSlice
.
b
,
hideTwitter
=
true
)
if
"hashtags"
in
ent
:
for
hashtag
in
ent
[
"hashtags"
]
:
replacements
.
extractHashtags
(
hashtag
)
if
"symbols"
in
ent
:
for
symbol
in
ent
[
"symbols"
]
:
replacements
.
extractHashtags
(
symbol
)
if
"user_mentions"
in
ent
:
for
mention
in
ent
[
"user_mentions"
]
:
let
name
=
mention
{
"screen_name"
}.
getStr
slice
=
mention
.
extractSlice
idx
=
tweet
.
reply
.
find
(
name
)
if
slice
.
a
>=
textSlice
.
a
:
replacements
.
add
ReplaceSlice
(
kind
:
rkMention
,
slice
:
slice
,
url
:
"/"
&
name
,
display
:
mention
[
"name"
]
.
getStr
)
if
idx
>
-
1
and
name
!=
replyTo
:
tweet
.
reply
.
delete
idx
elif
idx
==
-
1
and
tweet
.
replyId
!=
0
:
tweet
.
reply
.
add
name
replacements
.
deduplicate
replacements
.
sort
(
cmp
)
tweet
.
text
=
orig
.
replacedWith
(
replacements
,
textSlice
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment