From f23d5b25e7211365ff938a599606c30c0160f84f Mon Sep 17 00:00:00 2001 From: Jacques Dainat <jacques.dainat@ird.fr> Date: Thu, 6 Mar 2025 21:22:59 +0100 Subject: [PATCH] improve and fix #1 --- docs/pages/bash_manip/bash_manip-5-grep2.md | 6 ++ docs/pages/bash_manip/bash_manip-6-awk.md | 91 ++++++++++++++++++++- docs/pages/bash_manip/bash_manip-7-sed.md | 20 ++--- 3 files changed, 103 insertions(+), 14 deletions(-) diff --git a/docs/pages/bash_manip/bash_manip-5-grep2.md b/docs/pages/bash_manip/bash_manip-5-grep2.md index a43adc8..029c923 100644 --- a/docs/pages/bash_manip/bash_manip-5-grep2.md +++ b/docs/pages/bash_manip/bash_manip-5-grep2.md @@ -10,6 +10,12 @@ ## Searching patterns (grep) +In order to use extended regular expressions in grep you should use the `-E` option: + +```bash +grep -E 'pattern' file +``` + Back to our data file `nat2021.csv` containing first names given to children born in France since 1900. Let's play with some RegEx... diff --git a/docs/pages/bash_manip/bash_manip-6-awk.md b/docs/pages/bash_manip/bash_manip-6-awk.md index e08333a..6e920f7 100644 --- a/docs/pages/bash_manip/bash_manip-6-awk.md +++ b/docs/pages/bash_manip/bash_manip-6-awk.md @@ -12,7 +12,6 @@ `AWK` is a powerful programming language designed for text processing and typically used as a data extraction and reporting tool. Named after its creators (Aho, Weinberger, and Kernighan), `AWK` is particularly well-suited for processing **columnar data** and performing complex text manipulations. - ```bash awk 'BEGIN { Initial action(s) } /pattern/ { by line action(s) } END { final action(s) }' file ``` @@ -41,11 +40,85 @@ RS | The record separator, which determines how awk separates input records. Def FNR | The record number in the current input file (resets for each new file). ARGV | An array containing the command-line arguments passed to awk. 
+## Built-in Functions + +Awk provides built-in functions for string manipulation and numeric operations, making it a powerful tool for text processing and calculations. + +### Numeric Manipulation + +awk includes several numeric functions for performing mathematical operations. + +| Function | Description +|----------|----------| +sin(x) | Returns the sine of x (x in radians) +cos(x) | Returns the cosine of x (x in radians) +atan2(y, x) | Returns the arctangent of y/x +log(x) | Returns the natural logarithm of x +exp(x) | Returns the exponential of x +sqrt(x) | Returns the square root of x +int(x) | Returns the integer part of x +rand() | Returns a random number between 0 and 1 +srand([x]) | Sets the seed for rand() and returns the previous seed + +### String Manipulation + +Awk provides several string functions to manipulate text. + +| Function | Description +|----------|----------| +length([string]) | Returns the length of the string (or the length of $0 if no string is given) +substr(string, start, [length]) | Returns the substring of string starting at start position with optional length +index(string, search) | Returns the position of search in string, or 0 if not found +match(string, regex) | Returns the position of the match of regex in string, or 0 if no match +split(string, array, [separator]) | Splits string into array elements using separator (default is FS) +tolower(string) | Returns a copy of string with all characters converted to lowercase +toupper(string) | Returns a copy of string with all characters converted to uppercase +sprintf(format, expressions) | Returns a formatted string using format and expressions + ## Programmation in awk -awk is a full-fledged programming language that supports control structures such as if-else and loops, making it powerful for text processing. 
Here’s a brief overview: +awk is a full-fledged programming language that uses two types of data structures (variables and arrays) and supports control structures such as if-else and loops, making it powerful for text processing. Here’s a brief overview: + +### Data Structure -### if/else statement +**variable** + +```bash +var = "string" # Assigns a string value to a variable +var = $1+$2 # Assigns the result of a calculation +``` + +**array** + +AWK arrays use keys instead of just numerical indices (like dictionaries in Python). They are dynamic, i.e. there is no need to declare the size; you can add elements anytime. + +```bash +# fill an array +array["fruit1"] = "apple"; +array["fruit2"] = "banana"; +array["fruit3"] = "cherry"; + +# delete an entry +delete array["fruit2"] + +# print results +for(key in array){ # Iterate over every key of the array + print key,array[key] # Printing index and value of current item +} + +``` + +It is possible to split a string into an array: + +```bash +awk 'BEGIN { + str = "apple,banana,cherry"; + split(str, myArray, ","); + print myArray[1]; # Output: apple +}' +``` + +### If/else Statement ```bash awk '{ if (condition) { action1 } else { action2 } }' file.txt @@ -76,13 +149,23 @@ You can combine conditions using logical operators: **Pattern Matching with Regular Expressions** -You can use regular expressions with the ~ (matches) or !~ (does not match) operators. +You can use the following approach: + +```bash +awk '/pattern/ { by line action(s) }' file +``` + +Or you can use regular expressions with the ~ (matches) or !~ (does not match) operators. | Operator | Description |----------|----------| ~ | Matches a regex pattern !~ | Does NOT match a regex pattern +```bash +awk '{ if ($2 ~ /pattern/) { action(s) } }' file +``` + ### loop You can also use for loops and while loops in awk. 
diff --git a/docs/pages/bash_manip/bash_manip-7-sed.md b/docs/pages/bash_manip/bash_manip-7-sed.md index 72670c9..988ba85 100644 --- a/docs/pages/bash_manip/bash_manip-7-sed.md +++ b/docs/pages/bash_manip/bash_manip-7-sed.md @@ -63,7 +63,7 @@ sed 'FLAG/<pattern>/<string>/FLAG' ## Line selection -### Syntax +**Syntax** ```bash sed -n 'line p' file @@ -78,7 +78,7 @@ sed -n 'line p' file | `sed '8,$ p' file` | Print lines from line 8 to the end of the file | | `sed -n '1~8 p' file` | Print from line 1, every 8 lines | `~` not supported by BSD sed (MacOS) -### Exercice +**Exercise** !!! question "Print the header and line 686 529 until the end." @@ -90,7 +90,7 @@ sed -n 'line p' file ## Line deletion -### Syntax +**Syntax** ```bash sed 'line d' file @@ -105,7 +105,7 @@ sed 'line d' file | `sed '8,$ d' file` | Delete lines from line 8 to the end of the file | | `sed '1~8d' file` | Delete from line 1, every 8 lines | `~` not supported by BSD sed (MacOS) -### Exercice +**Exercise** !!! question "Delete everything from line 10 to 686 529." @@ -116,7 +116,7 @@ sed 'line d' file ## Use of Regular Expression -### Syntax +**Syntax** ```bash sed 'RegEx' file @@ -144,7 +144,7 @@ sed 'RegEx' file {n,m} | Matches between n and m occurrences (Extended Regex) | sed -E 's/a{2,4}/c/g' \ | Escapes special characters | sed 's/\./X/g' -### Exercice +**Exercise** !!! question "Select all line that match PIERRE in the 2000s." @@ -155,7 +155,7 @@ sed 'RegEx' file ## Subsitution -### Syntax +**Syntax** ```bash sed 's/pattern/replacement/' file @@ -169,7 +169,7 @@ sed 's/pattern/replacement/' file | `s/pattern/replacement/i` | Substitute the first occurrence of pattern with replacement, ignoring case | | `s/pattern/replacement/gi` | Substitute all occurrences of pattern with replacement, ignoring case | -### Exercice +**Exercise** !!! question "Replace all numbers from last colum by XX" @@ -190,7 +190,7 @@ sed 's/pattern/replacement/' file It is possible to extract part of a line. 
Let's take the example of the extraction of a value from an attribute (`tag=value`) with tag `Name` of the 9th column of a GFF/GTF file. -### Syntax +**Syntax** ```bash sed -n 's/.*START\([^END]*\)END.*/\1/p' file.txt @@ -208,7 +208,7 @@ sed -n 's/.*START\([^END]*\)END.*/\1/p' file.txt * `\1` Prints the first captured group (here only 1 has been captured). * `p` Explicitly prints the result (only used with -n). -### Exercice +**Exercise** !!! question "List all names that are associated to PIERRE (e.g. OLIVIER that is used to do PIERRE-OLIVER)" -- GitLab